xref: /onnv-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision 4929:132c9e4daa52)
10Sstevel@tonic-gate /*
2*4929Srk129064  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
30Sstevel@tonic-gate  * Use is subject to license terms.
40Sstevel@tonic-gate  */
50Sstevel@tonic-gate 
60Sstevel@tonic-gate /*
70Sstevel@tonic-gate  * Copyright (c) 1987 Regents of the University of California.
80Sstevel@tonic-gate  * All rights reserved.
90Sstevel@tonic-gate  *
100Sstevel@tonic-gate  * Redistribution and use in source and binary forms are permitted
110Sstevel@tonic-gate  * provided that the above copyright notice and this paragraph are
120Sstevel@tonic-gate  * duplicated in all such forms and that any documentation,
130Sstevel@tonic-gate  * advertising materials, and other materials related to such
140Sstevel@tonic-gate  * distribution and use acknowledge that the software was developed
150Sstevel@tonic-gate  * by the University of California, Berkeley. The name of the
160Sstevel@tonic-gate  * University may not be used to endorse or promote products derived
170Sstevel@tonic-gate  * from this software without specific prior written permission.
180Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
190Sstevel@tonic-gate  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
200Sstevel@tonic-gate  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
210Sstevel@tonic-gate  */
220Sstevel@tonic-gate 
230Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
240Sstevel@tonic-gate 
250Sstevel@tonic-gate #include "mpd_defs.h"
260Sstevel@tonic-gate #include "mpd_tables.h"
270Sstevel@tonic-gate 
/*
 * Probe types for probe().  The chosen value travels in the
 * pr_icmp_mtype field of the probe packet, and in_data() uses it to
 * dispatch the reply to the matching handler.
 */
#define	PROBE_UNI	0x1234	/* Unicast probe packet */
#define	PROBE_MULTI	0x5678	/* Multicast probe packet */
#define	PROBE_RTT	0x9abc	/* RTT only probe packet */

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
360Sstevel@tonic-gate 
/*
 * Layout of a probe and of its response.  A probe is an ICMP Echo
 * request and the response is an ICMP Echo reply; IPv4 and IPv6 share
 * this same layout.
 *
 * probe() stores pr_icmp_seq, pr_icmp_timestamp and pr_icmp_mtype in
 * network byte order; pr_icmp_id is copied from pii_icmpid, which is
 * already kept in network byte order.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* type field */
	uint8_t		pr_icmp_code;		/* code field */
	uint16_t	pr_icmp_cksum;		/* checksum field */
	uint16_t	pr_icmp_id;		/* Identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint32_t	pr_icmp_timestamp;	/* Time stamp */
	uint32_t	pr_icmp_mtype;		/* Message type (PROBE_*) */
};
510Sstevel@tonic-gate 
520Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
530Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
540Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
550Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x1 } };
560Sstevel@tonic-gate 
570Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
580Sstevel@tonic-gate 
590Sstevel@tonic-gate static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
600Sstevel@tonic-gate 
610Sstevel@tonic-gate static void		*find_ancillary(struct msghdr *msg, int cmsg_type);
620Sstevel@tonic-gate static void		pi_set_crtt(struct target *tg, int m,
630Sstevel@tonic-gate     boolean_t is_probe_uni);
640Sstevel@tonic-gate static void		incoming_echo_reply(struct phyint_instance *pii,
650Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
660Sstevel@tonic-gate static void		incoming_rtt_reply(struct phyint_instance *pii,
670Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
680Sstevel@tonic-gate static void		incoming_mcast_reply(struct phyint_instance *pii,
690Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
700Sstevel@tonic-gate 
710Sstevel@tonic-gate static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
720Sstevel@tonic-gate static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
730Sstevel@tonic-gate static boolean_t	check_exception_target(struct phyint_instance *pii,
740Sstevel@tonic-gate     struct target *target);
750Sstevel@tonic-gate static void		probe_fail_info(struct phyint_instance *pii,
760Sstevel@tonic-gate     struct target *cur_tg, struct probe_fail_count *pfinfo);
770Sstevel@tonic-gate static void		probe_success_info(struct phyint_instance *pii,
780Sstevel@tonic-gate     struct target *cur_tg, struct probe_success_count *psinfo);
790Sstevel@tonic-gate static boolean_t	phyint_repaired(struct phyint *pi);
800Sstevel@tonic-gate 
810Sstevel@tonic-gate static int		failover(struct phyint *from, struct phyint *to);
820Sstevel@tonic-gate static int		failback(struct phyint *from, struct phyint *to);
830Sstevel@tonic-gate static struct phyint	*get_failover_dst(struct phyint *pi, int failover_type);
840Sstevel@tonic-gate 
850Sstevel@tonic-gate static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
860Sstevel@tonic-gate static int 		in_cksum(ushort_t *addr, int len);
870Sstevel@tonic-gate static void		reset_snxt_basetimes(void);
880Sstevel@tonic-gate 
890Sstevel@tonic-gate /*
900Sstevel@tonic-gate  * CRTT - Conservative Round Trip Time Estimate
910Sstevel@tonic-gate  * Probe success - A matching probe reply received before CRTT ms has elapsed
920Sstevel@tonic-gate  *	after sending the probe.
930Sstevel@tonic-gate  * Probe failure - No probe reply received and more than CRTT ms has elapsed
940Sstevel@tonic-gate  *	after sending the probe.
950Sstevel@tonic-gate  *
960Sstevel@tonic-gate  * TLS - Time last success. Most recent probe ack received at this time.
970Sstevel@tonic-gate  * TFF - Time first fail. The time of the earliest probe failure in
980Sstevel@tonic-gate  *	a consecutive series of probe failures.
990Sstevel@tonic-gate  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
1000Sstevel@tonic-gate  * 	before declaring phyint repair.
1010Sstevel@tonic-gate  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
1020Sstevel@tonic-gate  *	declare a phyint failure.
1030Sstevel@tonic-gate  *
1040Sstevel@tonic-gate  * 			Phyint state diagram
1050Sstevel@tonic-gate  *
1060Sstevel@tonic-gate  * The state of a phyint that is capable of being probed, is completely
1070Sstevel@tonic-gate  * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
1080Sstevel@tonic-gate  *
1090Sstevel@tonic-gate  * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
1100Sstevel@tonic-gate  * of the link (according to the driver).  If the phyint is also configured
1110Sstevel@tonic-gate  * with a test address (the common case) and probe targets, then a phyint must
1120Sstevel@tonic-gate  * also successfully be able to send and receive probes in order to remain in
1130Sstevel@tonic-gate  * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
1140Sstevel@tonic-gate  *
1150Sstevel@tonic-gate  * Further, if a PI_RUNNING phyint is configured with a test address but is
1160Sstevel@tonic-gate  * unable to find any probe targets, it will transition to the PI_NOTARGETS
1170Sstevel@tonic-gate  * state, which indicates that the link is apparently functional but that
1180Sstevel@tonic-gate  * in.mpathd is unable to send probes to verify functionality (in this case,
1190Sstevel@tonic-gate  * in.mpathd makes the optimistic assumption that the interface is working
1200Sstevel@tonic-gate  * correctly and thus does not perform a failover, but reports the interface
1210Sstevel@tonic-gate  * as IPMP_IF_UNKNOWN through the async events and query interfaces).
1220Sstevel@tonic-gate  *
1230Sstevel@tonic-gate  * At any point, a phyint may be administratively marked offline via if_mpadm.
1240Sstevel@tonic-gate  * In this case, the interface always transitions to PI_OFFLINE, regardless
1250Sstevel@tonic-gate  * of its previous state.  When the interface is later brought back online,
1260Sstevel@tonic-gate  * in.mpathd acts as if the interface is new (and thus it transitions to
1270Sstevel@tonic-gate  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
1280Sstevel@tonic-gate  * its probes, if probes are sent).
1290Sstevel@tonic-gate  *
1300Sstevel@tonic-gate  * pi_state -  PI_RUNNING or PI_FAILED
1310Sstevel@tonic-gate  *	PI_RUNNING: The failure detection logic says the phyint is good.
1320Sstevel@tonic-gate  *	PI_FAILED: The failure detection logic says the phyint has failed.
1330Sstevel@tonic-gate  *
1340Sstevel@tonic-gate  * pg_groupfailed  - Group failure, all interfaces in the group have failed.
1350Sstevel@tonic-gate  *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
1360Sstevel@tonic-gate  *	In the case of router targets, we assume that the current list of
1370Sstevel@tonic-gate  *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
1390Sstevel@tonic-gate  *	list of targets, and multicast to the all hosts, to reconstruct the
1400Sstevel@tonic-gate  *	target list. So the phyints are in the PI_NOTARGETS state.
1410Sstevel@tonic-gate  *
1420Sstevel@tonic-gate  * I -	value of (pi_flags & IFF_INACTIVE)
143704Sethindra  *	IFF_INACTIVE: No failovers have been done to this phyint, from
144704Sethindra  *		other phyints. This phyint is inactive. Phyint can be a Standby.
145704Sethindra  *		When failback has been disabled (FAILOVER=no configured),
146704Sethindra  *		phyint can also be a non-STANDBY. In this case IFF_INACTIVE
147704Sethindra  *		is set when phyint subsequently recovers after a failure.
1480Sstevel@tonic-gate  *
1490Sstevel@tonic-gate  * pi_empty
1500Sstevel@tonic-gate  *	This phyint has failed over successfully to another phyint, and
1510Sstevel@tonic-gate  *	this phyint is currently "empty". It does not host any addresses or
1520Sstevel@tonic-gate  *	multicast membership etc. This is the state of a phyint after a
1530Sstevel@tonic-gate  *	failover from the phyint has completed successfully and no subsequent
1540Sstevel@tonic-gate  *	'failover to' or 'failback to' has occurred on the phyint.
1550Sstevel@tonic-gate  *	IP guarantees that no new logicals will be hosted nor any multicast
1560Sstevel@tonic-gate  *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. pi_empty being set implies the phyint is either failed or
 *	inactive.
1590Sstevel@tonic-gate  *
1600Sstevel@tonic-gate  * pi_full
1610Sstevel@tonic-gate  *	The phyint hosts all of its own addresses that it "owns". If the
 *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully, i.e. no more failbacks to this phyint
1640Sstevel@tonic-gate  *	can produce any change in system state whatsoever.
1650Sstevel@tonic-gate  *
1660Sstevel@tonic-gate  * Not all 32 possible combinations of the above 5-tuple are possible.
1670Sstevel@tonic-gate  * Furthermore some of the above combinations are transient. They may occur
1680Sstevel@tonic-gate  * only because the failover or failback did not complete successfully. The
1690Sstevel@tonic-gate  * failover/failback will be retried and eventually a stable state will be
1700Sstevel@tonic-gate  * reached.
1710Sstevel@tonic-gate  *
1720Sstevel@tonic-gate  * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
1730Sstevel@tonic-gate  * The following are the state machines. 'from' and 'to' are the src and
1740Sstevel@tonic-gate  * dst of the failover/failback, below
1750Sstevel@tonic-gate  *
1760Sstevel@tonic-gate  *			pi_empty state machine
1770Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1780Sstevel@tonic-gate  *	Event				State	->	New State
1790Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1800Sstevel@tonic-gate  *	successful completion 		from.pi_empty = 0 -> from.pi_empty = 1
1810Sstevel@tonic-gate  *	of failover
1820Sstevel@tonic-gate  *
1830Sstevel@tonic-gate  *	Initiate failover 		to.pi_empty = X   -> to.pi_empty = 0
1840Sstevel@tonic-gate  *
1850Sstevel@tonic-gate  * 	Initiate failback 		to.pi_empty = X   -> to.pi_empty = 0
1860Sstevel@tonic-gate  *
1870Sstevel@tonic-gate  * 	group failure			pi_empty = X	  -> pi_empty = 0
1880Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1890Sstevel@tonic-gate  *
1900Sstevel@tonic-gate  *			pi_full state machine
1910Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1920Sstevel@tonic-gate  *	Event				State		  -> New State
1930Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1940Sstevel@tonic-gate  *	successful completion		to.pi_full = 0    -> to.pi_full = 1
1950Sstevel@tonic-gate  *	of failback from
1960Sstevel@tonic-gate  *	each of the other phyints
1970Sstevel@tonic-gate  *
1980Sstevel@tonic-gate  *	Initiate failover 		from.pi_full = X  -> from.pi_full = 0
1990Sstevel@tonic-gate  *
2000Sstevel@tonic-gate  *	group failure			pi_full = X	  -> pi_full = 0
2010Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2020Sstevel@tonic-gate  *
2030Sstevel@tonic-gate  *			pi_state state machine
2040Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2050Sstevel@tonic-gate  *	Event			State			New State
2060Sstevel@tonic-gate  *				Action:
2070Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2080Sstevel@tonic-gate  *	NIC failure		(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
2090Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
2100Sstevel@tonic-gate  *				: failover from this phyint to another
2110Sstevel@tonic-gate  *
212704Sethindra  *	NIC failure		(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
2130Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
2140Sstevel@tonic-gate  *
215704Sethindra  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=yes)
216704Sethindra  *	detection				     -> (PI_RUNNING, I == 0)
217704Sethindra  *				: to.pi_empty = 0
2180Sstevel@tonic-gate  *				: clear IFF_FAILED on this phyint
219704Sethindra  *				: failback to this phyint if enabled
2200Sstevel@tonic-gate  *
221704Sethindra  *	NIC repair 		(PI_FAILED, I == 0, FAILBACK=no)
222704Sethindra  *	detection				     ->	(PI_RUNNING, I == 1)
223704Sethindra  *				: to.pi_empty = 0
224704Sethindra  *				: clear IFF_FAILED on this phyint
225704Sethindra  *				: if failback is disabled set I == 1
2260Sstevel@tonic-gate  *
2270Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
2280Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_FAILED
2290Sstevel@tonic-gate  *	(Router targets)	: set IFF_FAILED
2300Sstevel@tonic-gate  *				: clear pi_empty and pi_full
2310Sstevel@tonic-gate  *
2320Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
2330Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_NOTARGETS
2340Sstevel@tonic-gate  *	(Host targets)		: set IFF_FAILED
2350Sstevel@tonic-gate  *				: clear pi_empty and pi_full
2360Sstevel@tonic-gate  *				: delete the target list on all phyints
2370Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2380Sstevel@tonic-gate  *
2390Sstevel@tonic-gate  *			I state machine
2400Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2410Sstevel@tonic-gate  *	Event		State			Action:
2420Sstevel@tonic-gate  * ---------------------------------------------------------------------------
243704Sethindra  *	Turn on I 	pi_empty == 0, STANDBY 	: failover from standby
2440Sstevel@tonic-gate  *
245704Sethindra  *	Turn off I 	PI_RUNNING, STANDBY	: pi_empty = 0
2460Sstevel@tonic-gate  *			pi_full == 0		: failback to this if enabled
2470Sstevel@tonic-gate  * ---------------------------------------------------------------------------
2480Sstevel@tonic-gate  *
2490Sstevel@tonic-gate  * Assertions: (Read '==>' as implies)
2500Sstevel@tonic-gate  *
2510Sstevel@tonic-gate  * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
2520Sstevel@tonic-gate  * (pi_empty == 1) ==> (pi_full == 0)
2530Sstevel@tonic-gate  * (pi_full  == 1) ==> (pi_empty == 0)
2540Sstevel@tonic-gate  *
2550Sstevel@tonic-gate  * Invariants
2560Sstevel@tonic-gate  *
2570Sstevel@tonic-gate  * pg_groupfailed = 0  &&
258704Sethindra  *   1. (I == 1, pi_empty == 0)		   ==> initiate failover from standby
2590Sstevel@tonic-gate  *   2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
2600Sstevel@tonic-gate  *   3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
2610Sstevel@tonic-gate  *
2620Sstevel@tonic-gate  * 1. says that an inactive standby, that is not empty, has to be failed
2630Sstevel@tonic-gate  * over. For a standby to be truly inactive, it should not host any
2640Sstevel@tonic-gate  * addresses. So we move them to some other phyint. Usually we catch the
2650Sstevel@tonic-gate  * turn on of IFF_INACTIVE, and perform this action. However if the failover
2660Sstevel@tonic-gate  * did not complete successfully, then subsequently we have lost the edge
2670Sstevel@tonic-gate  * trigger, and this invariant kicks in and completes the action.
2680Sstevel@tonic-gate  *
2690Sstevel@tonic-gate  * 2. says that any failed phyint that is not empty must be failed over.
2700Sstevel@tonic-gate  * Usually we do the failover when we detect NIC failure. However if the
2710Sstevel@tonic-gate  * failover does not complete successfully, this invariant kicks in and
2720Sstevel@tonic-gate  * completes the failover. We exclude inactive standby which is covered by 1.
2730Sstevel@tonic-gate  *
2740Sstevel@tonic-gate  * 3. says that any running phyint that is not full must be failed back.
2750Sstevel@tonic-gate  * Usually we do the failback when we detect NIC repair. However if the
2760Sstevel@tonic-gate  * failback does not complete successfully, this invariant kicks in and
2770Sstevel@tonic-gate  * completes the failback. Note that we don't want to failback to an inactive
2780Sstevel@tonic-gate  * standby.
2790Sstevel@tonic-gate  *
2800Sstevel@tonic-gate  * The invariants 1 - 3 and the actions are in initifs().
2810Sstevel@tonic-gate  */
2820Sstevel@tonic-gate 
2830Sstevel@tonic-gate struct probes_missed probes_missed;
2840Sstevel@tonic-gate 
2850Sstevel@tonic-gate /*
2860Sstevel@tonic-gate  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
2870Sstevel@tonic-gate  * will be added on by the kernel.  The id field identifies this phyint.
2880Sstevel@tonic-gate  * and the sequence number is an increasing (modulo 2^^16) integer. The data
2890Sstevel@tonic-gate  * portion holds the time value when the packet is sent. On echo this is
2900Sstevel@tonic-gate  * extracted to compute the round-trip time. Three different types of
2910Sstevel@tonic-gate  * probe packets are used.
2920Sstevel@tonic-gate  *
2930Sstevel@tonic-gate  * PROBE_UNI: This type is used to do failure detection / failure recovery
2940Sstevel@tonic-gate  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
2950Sstevel@tonic-gate  *	not less than the current CRTT. pii_probes[] stores data
2960Sstevel@tonic-gate  *	about these probes. These packets consume sequence number space.
2970Sstevel@tonic-gate  *
2980Sstevel@tonic-gate  * PROBE_RTT: This type is used to make only rtt measurments. Normally these
2990Sstevel@tonic-gate  * 	are not used. Under heavy network load, the rtt may go up very high,
3000Sstevel@tonic-gate  *	due to a spike, or may appear to go high, due to extreme scheduling
3010Sstevel@tonic-gate  * 	delays. Once the network stress is removed, mpathd takes long time to
3020Sstevel@tonic-gate  *	recover, because the probe_interval is already high, and it takes
3030Sstevel@tonic-gate  *	a long time to send out sufficient number of probes to bring down the
3040Sstevel@tonic-gate  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
3050Sstevel@tonic-gate  *	user_probe_interval ms. and will cause only rtt updates. These packets
3060Sstevel@tonic-gate  *	do not consume sequence number space nor is information about these
3070Sstevel@tonic-gate  *	packets stored in the pii_probes[]
3080Sstevel@tonic-gate  *
3090Sstevel@tonic-gate  * PROBE_MULTI: This type is only used to construct a list of targets, when
3100Sstevel@tonic-gate  *	no targets are known. The packet is multicast to the all hosts addr.
3110Sstevel@tonic-gate  */
3120Sstevel@tonic-gate static void
3130Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time)
3140Sstevel@tonic-gate {
3150Sstevel@tonic-gate 	struct pr_icmp probe_pkt;	/* Probe packet */
3160Sstevel@tonic-gate 	struct sockaddr_in6 whereto6; 	/* target address IPv6 */
3170Sstevel@tonic-gate 	struct sockaddr_in whereto; 	/* target address IPv4 */
3180Sstevel@tonic-gate 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
3190Sstevel@tonic-gate 	boolean_t sent = _B_TRUE;
3200Sstevel@tonic-gate 
3210Sstevel@tonic-gate 	if (debug & D_TARGET) {
3220Sstevel@tonic-gate 		logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af),
3230Sstevel@tonic-gate 		    pii->pii_name, probe_type, cur_time);
3240Sstevel@tonic-gate 	}
3250Sstevel@tonic-gate 
3260Sstevel@tonic-gate 	assert(pii->pii_probe_sock != -1);
3270Sstevel@tonic-gate 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
3280Sstevel@tonic-gate 	    probe_type == PROBE_RTT);
3290Sstevel@tonic-gate 
3300Sstevel@tonic-gate 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
3310Sstevel@tonic-gate 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
3320Sstevel@tonic-gate 	probe_pkt.pr_icmp_code = 0;
3330Sstevel@tonic-gate 	probe_pkt.pr_icmp_cksum = 0;
3340Sstevel@tonic-gate 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
3350Sstevel@tonic-gate 
3360Sstevel@tonic-gate 	/*
3370Sstevel@tonic-gate 	 * Since there is no need to do arithmetic on the icmpid,
3380Sstevel@tonic-gate 	 * (only equality check is done) pii_icmpid is stored in
3390Sstevel@tonic-gate 	 * network byte order at initialization itself.
3400Sstevel@tonic-gate 	 */
3410Sstevel@tonic-gate 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
3420Sstevel@tonic-gate 	probe_pkt.pr_icmp_timestamp = htonl(cur_time);
3430Sstevel@tonic-gate 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
3440Sstevel@tonic-gate 
3450Sstevel@tonic-gate 	/*
3460Sstevel@tonic-gate 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
3470Sstevel@tonic-gate 	 * the all hosts address. Otherwise it is unicast to the next target.
3480Sstevel@tonic-gate 	 */
3490Sstevel@tonic-gate 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
3500Sstevel@tonic-gate 	    pii->pii_rtt_target_next != NULL));
3510Sstevel@tonic-gate 
3520Sstevel@tonic-gate 	if (pii->pii_af == AF_INET6) {
3530Sstevel@tonic-gate 		bzero(&whereto6, sizeof (whereto6));
3540Sstevel@tonic-gate 		whereto6.sin6_family = AF_INET6;
3550Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
3560Sstevel@tonic-gate 			whereto6.sin6_addr = all_nodes_mcast_v6;
3570Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
3580Sstevel@tonic-gate 			whereto6.sin6_addr = pii->pii_target_next->tg_address;
3590Sstevel@tonic-gate 		} else  {
3600Sstevel@tonic-gate 			/* type is PROBE_RTT */
3610Sstevel@tonic-gate 			whereto6.sin6_addr =
3620Sstevel@tonic-gate 			    pii->pii_rtt_target_next->tg_address;
3630Sstevel@tonic-gate 		}
3640Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
3650Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6,
3660Sstevel@tonic-gate 		    sizeof (whereto6)) != sizeof (probe_pkt)) {
3670Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
3680Sstevel@tonic-gate 			sent = _B_FALSE;
3690Sstevel@tonic-gate 		}
3700Sstevel@tonic-gate 	} else {
3710Sstevel@tonic-gate 		bzero(&whereto, sizeof (whereto));
3720Sstevel@tonic-gate 		whereto.sin_family = AF_INET;
3730Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
3740Sstevel@tonic-gate 			whereto.sin_addr = all_nodes_mcast_v4;
3750Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
3760Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
3770Sstevel@tonic-gate 			    &pii->pii_target_next->tg_address,
3780Sstevel@tonic-gate 			    &whereto.sin_addr);
3790Sstevel@tonic-gate 		} else {
3800Sstevel@tonic-gate 			/* type is PROBE_RTT */
3810Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
3820Sstevel@tonic-gate 			    &pii->pii_rtt_target_next->tg_address,
3830Sstevel@tonic-gate 			    &whereto.sin_addr);
3840Sstevel@tonic-gate 		}
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate 		/*
3870Sstevel@tonic-gate 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
3880Sstevel@tonic-gate 		 */
3890Sstevel@tonic-gate 		probe_pkt.pr_icmp_cksum =
3900Sstevel@tonic-gate 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
3910Sstevel@tonic-gate 		if (sendto(pii->pii_probe_sock, (char *)&probe_pkt,
3920Sstevel@tonic-gate 		    sizeof (probe_pkt), 0, (struct sockaddr *)&whereto,
3930Sstevel@tonic-gate 		    sizeof (whereto)) != sizeof (probe_pkt)) {
3940Sstevel@tonic-gate 			logperror_pii(pii, "probe: probe sendto");
3950Sstevel@tonic-gate 			sent = _B_FALSE;
3960Sstevel@tonic-gate 		}
3970Sstevel@tonic-gate 	}
3980Sstevel@tonic-gate 
3990Sstevel@tonic-gate 	/*
4000Sstevel@tonic-gate 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
4010Sstevel@tonic-gate 	 * update our tables. We will need this info in processing the probe
4020Sstevel@tonic-gate 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
4030Sstevel@tonic-gate 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
4040Sstevel@tonic-gate 	 * are only used to construct a list of targets. PROBE_RTT packets are
4050Sstevel@tonic-gate 	 * used only for updating the rtt and not for failure detection.
4060Sstevel@tonic-gate 	 */
4070Sstevel@tonic-gate 	if (probe_type == PROBE_UNI && sent) {
4080Sstevel@tonic-gate 		pr_ndx = pii->pii_probe_next;
4090Sstevel@tonic-gate 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
4100Sstevel@tonic-gate 
4110Sstevel@tonic-gate 		/* Collect statistics, before we reuse the last slot. */
4120Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
4130Sstevel@tonic-gate 			pii->pii_cum_stats.lost++;
4140Sstevel@tonic-gate 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
4150Sstevel@tonic-gate 			pii->pii_cum_stats.acked++;
4160Sstevel@tonic-gate 		pii->pii_cum_stats.sent++;
4170Sstevel@tonic-gate 
4180Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_status = PR_UNACKED;
4190Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
4200Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_time_sent = cur_time;
4210Sstevel@tonic-gate 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
4220Sstevel@tonic-gate 		pii->pii_target_next = target_next(pii->pii_target_next);
4230Sstevel@tonic-gate 		assert(pii->pii_target_next != NULL);
4240Sstevel@tonic-gate 		/*
4250Sstevel@tonic-gate 		 * If we have a single variable to denote the next target to
4260Sstevel@tonic-gate 		 * probe for both rtt probes and failure detection probes, we
4270Sstevel@tonic-gate 		 * could end up with a situation where the failure detection
4280Sstevel@tonic-gate 		 * probe targets become disjoint from the rtt probe targets.
4290Sstevel@tonic-gate 		 * Eg. if 2 targets and the actual fdt is double the user
4300Sstevel@tonic-gate 		 * specified fdt. So we have 2 variables. In this scheme
4310Sstevel@tonic-gate 		 * we also reset pii_rtt_target_next for every fdt probe,
4320Sstevel@tonic-gate 		 * though that may not be necessary.
4330Sstevel@tonic-gate 		 */
4340Sstevel@tonic-gate 		pii->pii_rtt_target_next = pii->pii_target_next;
4350Sstevel@tonic-gate 		pii->pii_snxt++;
4360Sstevel@tonic-gate 	} else if (probe_type == PROBE_RTT) {
4370Sstevel@tonic-gate 		pii->pii_rtt_target_next =
4380Sstevel@tonic-gate 		    target_next(pii->pii_rtt_target_next);
4390Sstevel@tonic-gate 		assert(pii->pii_rtt_target_next != NULL);
4400Sstevel@tonic-gate 	}
4410Sstevel@tonic-gate }
4420Sstevel@tonic-gate 
4430Sstevel@tonic-gate /*
4440Sstevel@tonic-gate  * Incoming IPv4 data from wire, is received here. Called from main.
4450Sstevel@tonic-gate  */
4460Sstevel@tonic-gate void
4470Sstevel@tonic-gate in_data(struct phyint_instance *pii)
4480Sstevel@tonic-gate {
4490Sstevel@tonic-gate 	struct	sockaddr_in 	from;
4500Sstevel@tonic-gate 	struct	in6_addr	fromaddr;
4510Sstevel@tonic-gate 	uint_t	fromlen;
4520Sstevel@tonic-gate 	static uint_t in_packet[(IP_MAXPACKET + 1)/4];
4530Sstevel@tonic-gate 	struct ip *ip;
4540Sstevel@tonic-gate 	int 	iphlen;
4550Sstevel@tonic-gate 	int 	len;
4560Sstevel@tonic-gate 	char 	abuf[INET_ADDRSTRLEN];
4570Sstevel@tonic-gate 	struct	pr_icmp	*reply;
4580Sstevel@tonic-gate 
4590Sstevel@tonic-gate 	if (debug & D_PROBE) {
4600Sstevel@tonic-gate 		logdebug("in_data(%s %s)\n",
4610Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
4620Sstevel@tonic-gate 	}
4630Sstevel@tonic-gate 
4640Sstevel@tonic-gate 	/*
4650Sstevel@tonic-gate 	 * Poll has already told us that a message is waiting,
4660Sstevel@tonic-gate 	 * on this socket. Read it now. We should not block.
4670Sstevel@tonic-gate 	 */
4680Sstevel@tonic-gate 	fromlen = sizeof (from);
4690Sstevel@tonic-gate 	len = recvfrom(pii->pii_probe_sock, (char *)in_packet,
4700Sstevel@tonic-gate 	    sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen);
4710Sstevel@tonic-gate 	if (len < 0) {
4720Sstevel@tonic-gate 		logperror_pii(pii, "in_data: recvfrom");
4730Sstevel@tonic-gate 		return;
4740Sstevel@tonic-gate 	}
4750Sstevel@tonic-gate 
4760Sstevel@tonic-gate 	/*
4770Sstevel@tonic-gate 	 * If the NIC has indicated the link is down, don't go
4780Sstevel@tonic-gate 	 * any further.
4790Sstevel@tonic-gate 	 */
4800Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
4810Sstevel@tonic-gate 		return;
4820Sstevel@tonic-gate 
4830Sstevel@tonic-gate 	/* Get the printable address for error reporting */
4840Sstevel@tonic-gate 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
4850Sstevel@tonic-gate 
4860Sstevel@tonic-gate 	/* Make sure packet contains at least minimum ICMP header */
4870Sstevel@tonic-gate 	ip = (struct ip *)in_packet;
4880Sstevel@tonic-gate 	iphlen = ip->ip_hl << 2;
4890Sstevel@tonic-gate 	if (len < iphlen + ICMP_MINLEN) {
4900Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
4910Sstevel@tonic-gate 			logdebug("in_data: packet too short (%d bytes)"
4920Sstevel@tonic-gate 			    " from %s\n", len, abuf);
4930Sstevel@tonic-gate 		}
4940Sstevel@tonic-gate 		return;
4950Sstevel@tonic-gate 	}
4960Sstevel@tonic-gate 
4970Sstevel@tonic-gate 	/*
4980Sstevel@tonic-gate 	 * Subtract the IP hdr length, 'len' will be length of the probe
4990Sstevel@tonic-gate 	 * reply, starting from the icmp hdr.
5000Sstevel@tonic-gate 	 */
5010Sstevel@tonic-gate 	len -= iphlen;
5020Sstevel@tonic-gate 	/* LINTED */
5030Sstevel@tonic-gate 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
5040Sstevel@tonic-gate 
5050Sstevel@tonic-gate 	/* Probe replies are icmp echo replies. Ignore anything else */
5060Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
5070Sstevel@tonic-gate 		return;
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate 	/*
5100Sstevel@tonic-gate 	 * The icmp id should match what we sent, which is stored
5110Sstevel@tonic-gate 	 * in pi_icmpid. The icmp code for reply must be 0.
5120Sstevel@tonic-gate 	 * The reply content must be a struct pr_icmp
5130Sstevel@tonic-gate 	 */
5140Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
5150Sstevel@tonic-gate 		/* Not in response to our probe */
5160Sstevel@tonic-gate 		return;
5170Sstevel@tonic-gate 	}
5180Sstevel@tonic-gate 
5190Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
5200Sstevel@tonic-gate 		logtrace("probe reply code %d from %s on %s\n",
5210Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
5220Sstevel@tonic-gate 		return;
5230Sstevel@tonic-gate 	}
5240Sstevel@tonic-gate 
5250Sstevel@tonic-gate 	if (len < sizeof (struct pr_icmp)) {
5260Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
5270Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
5280Sstevel@tonic-gate 		return;
5290Sstevel@tonic-gate 	}
5300Sstevel@tonic-gate 
5310Sstevel@tonic-gate 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
5320Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
5330Sstevel@tonic-gate 		/* Unicast probe reply */
5340Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, fromaddr);
5350Sstevel@tonic-gate 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
5360Sstevel@tonic-gate 		/* Multicast reply */
5370Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, fromaddr);
5380Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
5390Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, fromaddr);
5400Sstevel@tonic-gate 	} else {
5410Sstevel@tonic-gate 		/* Probably not in response to our probe */
5420Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
5430Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
5440Sstevel@tonic-gate 		return;
5450Sstevel@tonic-gate 	}
5460Sstevel@tonic-gate 
5470Sstevel@tonic-gate }
5480Sstevel@tonic-gate 
5490Sstevel@tonic-gate /*
5500Sstevel@tonic-gate  * Incoming IPv6 data from wire is received here. Called from main.
5510Sstevel@tonic-gate  */
5520Sstevel@tonic-gate void
5530Sstevel@tonic-gate in6_data(struct phyint_instance *pii)
5540Sstevel@tonic-gate {
5550Sstevel@tonic-gate 	struct sockaddr_in6 from;
5560Sstevel@tonic-gate 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
5570Sstevel@tonic-gate 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
5580Sstevel@tonic-gate 	int len;
5590Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
5600Sstevel@tonic-gate 	struct msghdr msg;
5610Sstevel@tonic-gate 	struct iovec iov;
5620Sstevel@tonic-gate 	uchar_t *opt;
5630Sstevel@tonic-gate 	struct	pr_icmp *reply;
5640Sstevel@tonic-gate 
5650Sstevel@tonic-gate 	if (debug & D_PROBE) {
5660Sstevel@tonic-gate 		logdebug("in6_data(%s %s)\n",
5670Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
5680Sstevel@tonic-gate 	}
5690Sstevel@tonic-gate 
5700Sstevel@tonic-gate 	iov.iov_base = (char *)in_packet;
5710Sstevel@tonic-gate 	iov.iov_len = sizeof (in_packet);
5720Sstevel@tonic-gate 	msg.msg_iov = &iov;
5730Sstevel@tonic-gate 	msg.msg_iovlen = 1;
5740Sstevel@tonic-gate 	msg.msg_name = (struct sockaddr *)&from;
5750Sstevel@tonic-gate 	msg.msg_namelen = sizeof (from);
5760Sstevel@tonic-gate 	msg.msg_control = ancillary_data;
5770Sstevel@tonic-gate 	msg.msg_controllen = sizeof (ancillary_data);
5780Sstevel@tonic-gate 
5790Sstevel@tonic-gate 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
5800Sstevel@tonic-gate 		logperror_pii(pii, "in6_data: recvfrom");
5810Sstevel@tonic-gate 		return;
5820Sstevel@tonic-gate 	}
5830Sstevel@tonic-gate 
5840Sstevel@tonic-gate 	/*
5850Sstevel@tonic-gate 	 * If the NIC has indicated that the link is down, don't go
5860Sstevel@tonic-gate 	 * any further.
5870Sstevel@tonic-gate 	 */
5880Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
5890Sstevel@tonic-gate 		return;
5900Sstevel@tonic-gate 
5910Sstevel@tonic-gate 	/* Get the printable address for error reporting */
5920Sstevel@tonic-gate 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
5930Sstevel@tonic-gate 	if (len < ICMP_MINLEN) {
5940Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
5950Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
5960Sstevel@tonic-gate 			    msg.msg_flags, abuf);
5970Sstevel@tonic-gate 		}
5980Sstevel@tonic-gate 		return;
5990Sstevel@tonic-gate 	}
6000Sstevel@tonic-gate 	/* Ignore packets > 64k or control buffers that don't fit */
6010Sstevel@tonic-gate 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
6020Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
6030Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
6040Sstevel@tonic-gate 			    msg.msg_flags, abuf);
6050Sstevel@tonic-gate 		}
6060Sstevel@tonic-gate 		return;
6070Sstevel@tonic-gate 	}
6080Sstevel@tonic-gate 
6090Sstevel@tonic-gate 	reply = (struct pr_icmp *)in_packet;
6100Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
6110Sstevel@tonic-gate 		return;
6120Sstevel@tonic-gate 
6130Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
6140Sstevel@tonic-gate 		/* Not in response to our probe */
6150Sstevel@tonic-gate 		return;
6160Sstevel@tonic-gate 	}
6170Sstevel@tonic-gate 
6180Sstevel@tonic-gate 	/*
6190Sstevel@tonic-gate 	 * The kernel has already verified the the ICMP checksum.
6200Sstevel@tonic-gate 	 */
6210Sstevel@tonic-gate 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
6220Sstevel@tonic-gate 		logtrace("ICMPv6 echo reply source address not linklocal from "
6230Sstevel@tonic-gate 		    "%s on %s\n", abuf, pii->pii_name);
6240Sstevel@tonic-gate 		return;
6250Sstevel@tonic-gate 	}
6260Sstevel@tonic-gate 	opt = find_ancillary(&msg, IPV6_RTHDR);
6270Sstevel@tonic-gate 	if (opt != NULL) {
6280Sstevel@tonic-gate 		/* Can't allow routing headers in probe replies  */
6290Sstevel@tonic-gate 		logtrace("message with routing header from %s on %s\n",
6300Sstevel@tonic-gate 		    abuf, pii->pii_name);
6310Sstevel@tonic-gate 		return;
6320Sstevel@tonic-gate 	}
6330Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
6340Sstevel@tonic-gate 		logtrace("probe reply code: %d from %s on %s\n",
6350Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
6360Sstevel@tonic-gate 		return;
6370Sstevel@tonic-gate 	}
6380Sstevel@tonic-gate 	if (len < (sizeof (struct pr_icmp))) {
6390Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
6400Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
6410Sstevel@tonic-gate 		return;
6420Sstevel@tonic-gate 	}
6430Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
6440Sstevel@tonic-gate 		incoming_echo_reply(pii, reply, from.sin6_addr);
6450Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
6460Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, from.sin6_addr);
6470Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
6480Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, from.sin6_addr);
6490Sstevel@tonic-gate 	} else  {
6500Sstevel@tonic-gate 		/* Probably not in response to our probe */
6510Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
6520Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
6530Sstevel@tonic-gate 	}
6540Sstevel@tonic-gate }
6550Sstevel@tonic-gate 
6560Sstevel@tonic-gate /*
6570Sstevel@tonic-gate  * Process the incoming rtt reply, in response to our rtt probe.
6580Sstevel@tonic-gate  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
6590Sstevel@tonic-gate  * have any stored information about the probe we sent. So we don't log
6600Sstevel@tonic-gate  * any errors if we receive bad replies.
6610Sstevel@tonic-gate  */
6620Sstevel@tonic-gate static void
6630Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
6640Sstevel@tonic-gate     struct in6_addr fromaddr)
6650Sstevel@tonic-gate {
6660Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
6670Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
6680Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
6690Sstevel@tonic-gate 	struct	target	*target;
6700Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
6710Sstevel@tonic-gate 	struct 	phyint_group *pg;
6720Sstevel@tonic-gate 
6730Sstevel@tonic-gate 	/* Get the printable address for error reporting */
6740Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
6750Sstevel@tonic-gate 
6760Sstevel@tonic-gate 	if (debug & D_PROBE) {
6770Sstevel@tonic-gate 		logdebug("incoming_rtt_reply: %s %s %s\n",
6780Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
6790Sstevel@tonic-gate 	}
6800Sstevel@tonic-gate 
6810Sstevel@tonic-gate 	/* Do we know this target ? */
6820Sstevel@tonic-gate 	target = target_lookup(pii, fromaddr);
6830Sstevel@tonic-gate 	if (target == NULL)
6840Sstevel@tonic-gate 		return;
6850Sstevel@tonic-gate 
6860Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
6870Sstevel@tonic-gate 	cur_time = getcurrenttime();
6880Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
6890Sstevel@tonic-gate 
6900Sstevel@tonic-gate 	/* Invalid rtt. It has wrapped around */
6910Sstevel@tonic-gate 	if (m < 0)
6920Sstevel@tonic-gate 		return;
6930Sstevel@tonic-gate 
6940Sstevel@tonic-gate 	/*
6950Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
6960Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
6970Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
6980Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
6990Sstevel@tonic-gate 	 */
7000Sstevel@tonic-gate 	pg = pii->pii_phyint->pi_group;
7010Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
7020Sstevel@tonic-gate 		return;
7030Sstevel@tonic-gate 
7040Sstevel@tonic-gate 	/*
7050Sstevel@tonic-gate 	 * Update rtt only if the new rtt is lower than the current rtt.
7060Sstevel@tonic-gate 	 * (specified by the 3rd parameter to pi_set_crtt).
7070Sstevel@tonic-gate 	 * If a spike has caused the current probe_interval to be >
7080Sstevel@tonic-gate 	 * user_probe_interval, then this mechanism is used to bring down
7090Sstevel@tonic-gate 	 * the rtt rapidly once the network stress is removed.
7100Sstevel@tonic-gate 	 * If the new rtt is higher than the current rtt, we don't want to
7110Sstevel@tonic-gate 	 * update the rtt. We are having more than 1 outstanding probe and
7120Sstevel@tonic-gate 	 * the increase in rtt we are seeing is being unnecessarily weighted
7130Sstevel@tonic-gate 	 * many times. The regular rtt update will be handled by
7140Sstevel@tonic-gate 	 * incoming_echo_reply() and will take care of any rtt increase.
7150Sstevel@tonic-gate 	 */
7160Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_FALSE);
7170Sstevel@tonic-gate 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
7180Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
7190Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
7200Sstevel@tonic-gate 		/*
7210Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
7220Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
7230Sstevel@tonic-gate 		 * meet whatever the user specified.
7240Sstevel@tonic-gate 		 */
7250Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
7260Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
7270Sstevel@tonic-gate 			    user_failure_detection_time);
7280Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
7290Sstevel@tonic-gate 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
7300Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
7310Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n",
7320Sstevel@tonic-gate 				    pg->pg_fdt, AF_STR(pii->pii_af),
7330Sstevel@tonic-gate 				    pii->pii_name,
7340Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_name);
7350Sstevel@tonic-gate 			}
7360Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
7370Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
7380Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
7390Sstevel@tonic-gate 				/*
7400Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
7410Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
7420Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
7430Sstevel@tonic-gate 				 * will be in sync henceforth.
7440Sstevel@tonic-gate 				 */
7450Sstevel@tonic-gate 				reset_snxt_basetimes();
7460Sstevel@tonic-gate 			}
7470Sstevel@tonic-gate 		}
7480Sstevel@tonic-gate 	}
7490Sstevel@tonic-gate }
7500Sstevel@tonic-gate 
7510Sstevel@tonic-gate /*
7520Sstevel@tonic-gate  * Process the incoming echo reply, in response to our unicast probe.
7530Sstevel@tonic-gate  * Common for both IPv4 and IPv6
7540Sstevel@tonic-gate  */
7550Sstevel@tonic-gate static void
7560Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
7570Sstevel@tonic-gate     struct in6_addr fromaddr)
7580Sstevel@tonic-gate {
7590Sstevel@tonic-gate 	int 	m;		/* rtt measurment in ms */
7600Sstevel@tonic-gate 	uint32_t cur_time;	/* in ms from some arbitrary point */
7610Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
7620Sstevel@tonic-gate 	int	pr_ndx;
7630Sstevel@tonic-gate 	struct	target	*target;
7640Sstevel@tonic-gate 	boolean_t exception;
7650Sstevel@tonic-gate 	uint32_t pr_icmp_timestamp;
7660Sstevel@tonic-gate 	uint16_t pr_icmp_seq;
7670Sstevel@tonic-gate 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
7680Sstevel@tonic-gate 
7690Sstevel@tonic-gate 	/* Get the printable address for error reporting */
7700Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
7710Sstevel@tonic-gate 
7720Sstevel@tonic-gate 	if (debug & D_PROBE) {
7730Sstevel@tonic-gate 		logdebug("incoming_echo_reply: %s %s %s seq %u\n",
7740Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
7750Sstevel@tonic-gate 		    ntohs(reply->pr_icmp_seq));
7760Sstevel@tonic-gate 	}
7770Sstevel@tonic-gate 
7780Sstevel@tonic-gate 	pr_icmp_timestamp  = ntohl(reply->pr_icmp_timestamp);
7790Sstevel@tonic-gate 	pr_icmp_seq  = ntohs(reply->pr_icmp_seq);
7800Sstevel@tonic-gate 
7810Sstevel@tonic-gate 	/* Reject out of window probe replies */
7820Sstevel@tonic-gate 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
7830Sstevel@tonic-gate 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
7840Sstevel@tonic-gate 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
7850Sstevel@tonic-gate 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7860Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
7870Sstevel@tonic-gate 		return;
7880Sstevel@tonic-gate 	}
7890Sstevel@tonic-gate 	cur_time = getcurrenttime();
7900Sstevel@tonic-gate 	m = (int)(cur_time - pr_icmp_timestamp);
7910Sstevel@tonic-gate 	if (m < 0) {
7920Sstevel@tonic-gate 		/*
7930Sstevel@tonic-gate 		 * This is a ridiculously high value of rtt. rtt has wrapped
7940Sstevel@tonic-gate 		 * around. Log a message, and ignore the rtt.
7950Sstevel@tonic-gate 		 */
7960Sstevel@tonic-gate 		logerr("incoming_echo_reply: rtt wraparound cur_time %u reply "
7970Sstevel@tonic-gate 		    "timestamp %u\n", cur_time, pr_icmp_timestamp);
7980Sstevel@tonic-gate 	}
7990Sstevel@tonic-gate 
8000Sstevel@tonic-gate 	/*
8010Sstevel@tonic-gate 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
8020Sstevel@tonic-gate 	 * number in our pii->pii_probes[] array. The icmp sequence number
8030Sstevel@tonic-gate 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
8040Sstevel@tonic-gate 	 */
8050Sstevel@tonic-gate 	pr_ndx = MOD_SUB(pii->pii_probe_next,
8060Sstevel@tonic-gate 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
8070Sstevel@tonic-gate 
8080Sstevel@tonic-gate 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
8090Sstevel@tonic-gate 
8100Sstevel@tonic-gate 	target = pii->pii_probes[pr_ndx].pr_target;
8110Sstevel@tonic-gate 
8120Sstevel@tonic-gate 	/*
8130Sstevel@tonic-gate 	 * Perform sanity checks, whether this probe reply that we
8140Sstevel@tonic-gate 	 * have received is genuine
8150Sstevel@tonic-gate 	 */
8160Sstevel@tonic-gate 	if (target != NULL) {
8170Sstevel@tonic-gate 		/*
8180Sstevel@tonic-gate 		 * Compare the src. addr of the received ICMP or ICMPv6
8190Sstevel@tonic-gate 		 * probe reply with the target address in our tables.
8200Sstevel@tonic-gate 		 */
8210Sstevel@tonic-gate 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
8220Sstevel@tonic-gate 			/*
8230Sstevel@tonic-gate 			 * We don't have any record of having sent a probe to
8240Sstevel@tonic-gate 			 * this target. This is a fake probe reply. Log an error
8250Sstevel@tonic-gate 			 */
8260Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
8270Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8280Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8290Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8300Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
8310Sstevel@tonic-gate 			return;
8320Sstevel@tonic-gate 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
8330Sstevel@tonic-gate 			/*
8340Sstevel@tonic-gate 			 * The address matches, but our tables indicate that
8350Sstevel@tonic-gate 			 * this probe reply has been acked already. So this
8360Sstevel@tonic-gate 			 * is a duplicate probe reply. Log an error
8370Sstevel@tonic-gate 			 */
8380Sstevel@tonic-gate 			logtrace("probe status %d Duplicate probe reply seq %u "
8390Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8400Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8410Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8420Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
8430Sstevel@tonic-gate 			return;
8440Sstevel@tonic-gate 		}
8450Sstevel@tonic-gate 	} else {
8460Sstevel@tonic-gate 		/*
8470Sstevel@tonic-gate 		 * Target must not be NULL in the PR_UNACKED state
8480Sstevel@tonic-gate 		 */
8490Sstevel@tonic-gate 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
8500Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
8510Sstevel@tonic-gate 			/*
8520Sstevel@tonic-gate 			 * The probe stats slot is unused. So we didn't
8530Sstevel@tonic-gate 			 * send out any probe to this target. This is a fake.
8540Sstevel@tonic-gate 			 * Log an error.
8550Sstevel@tonic-gate 			 */
8560Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
8570Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8580Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8590Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8600Sstevel@tonic-gate 		}
8610Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
8620Sstevel@tonic-gate 		return;
8630Sstevel@tonic-gate 	}
8640Sstevel@tonic-gate 
8650Sstevel@tonic-gate 	/*
8660Sstevel@tonic-gate 	 * If the rtt does not appear to be right, don't update the
8670Sstevel@tonic-gate 	 * rtt stats. This can happen if the system dropped into the
8680Sstevel@tonic-gate 	 * debugger, or the system was hung or too busy for a
8690Sstevel@tonic-gate 	 * substantial time that we didn't get a chance to run.
8700Sstevel@tonic-gate 	 */
8710Sstevel@tonic-gate 	if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) {
8720Sstevel@tonic-gate 		/*
8730Sstevel@tonic-gate 		 * If the probe corresponding to this receieved response
8740Sstevel@tonic-gate 		 * was truly sent 'm' ms. ago, then this response must
8750Sstevel@tonic-gate 		 * have been rejected by the sequence number checks. The
8760Sstevel@tonic-gate 		 * fact that it has passed the sequence number checks
8770Sstevel@tonic-gate 		 * means that the measured rtt is wrong. We were probably
8780Sstevel@tonic-gate 		 * scheduled long after the packet was received.
8790Sstevel@tonic-gate 		 */
8800Sstevel@tonic-gate 		goto out;
8810Sstevel@tonic-gate 	}
8820Sstevel@tonic-gate 
8830Sstevel@tonic-gate 	/*
8840Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
8850Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
8860Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
8870Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
8880Sstevel@tonic-gate 	 */
8890Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
8900Sstevel@tonic-gate 		goto out;
8910Sstevel@tonic-gate 
8920Sstevel@tonic-gate 	/*
8930Sstevel@tonic-gate 	 * Don't update the Conservative Round Trip Time estimate for this
8940Sstevel@tonic-gate 	 * (phint, target) pair if this is the not the highest ack seq seen
8950Sstevel@tonic-gate 	 * thus far on this target.
8960Sstevel@tonic-gate 	 */
8970Sstevel@tonic-gate 	if (!highest_ack_tg(pr_icmp_seq, target))
8980Sstevel@tonic-gate 		goto out;
8990Sstevel@tonic-gate 
9000Sstevel@tonic-gate 	/*
9010Sstevel@tonic-gate 	 * Always update the rtt. This is a failure detection probe
9020Sstevel@tonic-gate 	 * and we want to measure both increase / decrease in rtt.
9030Sstevel@tonic-gate 	 */
9040Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_TRUE);
9050Sstevel@tonic-gate 
9060Sstevel@tonic-gate 	/*
9070Sstevel@tonic-gate 	 * If the crtt exceeds the average time between probes,
9080Sstevel@tonic-gate 	 * investigate if this slow target is an exception. If so we
9090Sstevel@tonic-gate 	 * can avoid this target and still meet the failure detection
9100Sstevel@tonic-gate 	 * time. Otherwise we can't meet the failure detection time.
9110Sstevel@tonic-gate 	 */
9120Sstevel@tonic-gate 	if (target->tg_crtt > pg->pg_probeint) {
9130Sstevel@tonic-gate 		exception = check_exception_target(pii, target);
9140Sstevel@tonic-gate 		if (exception) {
9150Sstevel@tonic-gate 			/*
9160Sstevel@tonic-gate 			 * This target is exceptionally slow. Don't use it
9170Sstevel@tonic-gate 			 * for future probes. check_exception_target() has
9180Sstevel@tonic-gate 			 * made sure that we have at least MIN_PROBE_TARGETS
9190Sstevel@tonic-gate 			 * other active targets
9200Sstevel@tonic-gate 			 */
9210Sstevel@tonic-gate 			if (pii->pii_targets_are_routers) {
9220Sstevel@tonic-gate 				/*
9230Sstevel@tonic-gate 				 * This is a slow router, mark it as slow
9240Sstevel@tonic-gate 				 * and don't use it for further probes. We
9250Sstevel@tonic-gate 				 * don't delete it, since it will be populated
9260Sstevel@tonic-gate 				 * again when we do a router scan. Hence we
9270Sstevel@tonic-gate 				 * need to maintain extra state (unlike the
9280Sstevel@tonic-gate 				 * host case below).  Mark it as TG_SLOW.
9290Sstevel@tonic-gate 				 */
9300Sstevel@tonic-gate 				if (target->tg_status == TG_ACTIVE)
9310Sstevel@tonic-gate 					pii->pii_ntargets--;
9320Sstevel@tonic-gate 				target->tg_status = TG_SLOW;
9330Sstevel@tonic-gate 				target->tg_latime = gethrtime();
9340Sstevel@tonic-gate 				target->tg_rtt_sa = -1;
9350Sstevel@tonic-gate 				target->tg_crtt = 0;
9360Sstevel@tonic-gate 				target->tg_rtt_sd = 0;
9370Sstevel@tonic-gate 				if (pii->pii_target_next == target) {
9380Sstevel@tonic-gate 					pii->pii_target_next =
9390Sstevel@tonic-gate 					    target_next(target);
9400Sstevel@tonic-gate 				}
9410Sstevel@tonic-gate 			} else {
9420Sstevel@tonic-gate 				/*
9430Sstevel@tonic-gate 				 * the slow target is not a router, we can
9440Sstevel@tonic-gate 				 * just delete it. Send an icmp multicast and
9450Sstevel@tonic-gate 				 * pick the fastest responder that is not
9460Sstevel@tonic-gate 				 * already an active target. target_delete()
9470Sstevel@tonic-gate 				 * adjusts pii->pii_target_next
9480Sstevel@tonic-gate 				 */
9490Sstevel@tonic-gate 				target_delete(target);
9500Sstevel@tonic-gate 				probe(pii, PROBE_MULTI, cur_time);
9510Sstevel@tonic-gate 			}
9520Sstevel@tonic-gate 		} else {
9530Sstevel@tonic-gate 			/*
9540Sstevel@tonic-gate 			 * We can't meet the failure detection time.
9550Sstevel@tonic-gate 			 * Log a message, and update the detection time to
9560Sstevel@tonic-gate 			 * whatever we can achieve.
9570Sstevel@tonic-gate 			 */
9580Sstevel@tonic-gate 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
9590Sstevel@tonic-gate 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
9600Sstevel@tonic-gate 			last_fdt_bumpup_time = gethrtime();
9610Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
9620Sstevel@tonic-gate 				logerr("Cannot meet requested failure detection"
9630Sstevel@tonic-gate 				    " time of %d ms on (%s %s) new failure"
9640Sstevel@tonic-gate 				    " detection time for group \"%s\" is %d"
9650Sstevel@tonic-gate 				    " ms\n", user_failure_detection_time,
9660Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
9670Sstevel@tonic-gate 				    pg->pg_name, pg->pg_fdt);
9680Sstevel@tonic-gate 			}
9690Sstevel@tonic-gate 		}
9700Sstevel@tonic-gate 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
9710Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
9720Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
9730Sstevel@tonic-gate 		/*
9740Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
9750Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
9760Sstevel@tonic-gate 		 * meet whatever the user specified.
9770Sstevel@tonic-gate 		 */
9780Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
9790Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
9800Sstevel@tonic-gate 			    user_failure_detection_time);
9810Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
9820Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
9830Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
9840Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n", pg->pg_fdt,
9850Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
9860Sstevel@tonic-gate 				    pg->pg_name);
9870Sstevel@tonic-gate 			}
9880Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
9890Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
9900Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
9910Sstevel@tonic-gate 				/*
9920Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
9930Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
9940Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
9950Sstevel@tonic-gate 				 * will be in sync henceforth.
9960Sstevel@tonic-gate 				 */
9970Sstevel@tonic-gate 				reset_snxt_basetimes();
9980Sstevel@tonic-gate 			}
9990Sstevel@tonic-gate 		}
10000Sstevel@tonic-gate 	}
10010Sstevel@tonic-gate out:
10020Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_status = PR_ACKED;
10030Sstevel@tonic-gate 	pii->pii_probes[pr_ndx].pr_time_acked = cur_time;
10040Sstevel@tonic-gate 
10050Sstevel@tonic-gate 	/*
10060Sstevel@tonic-gate 	 * Update pii->pii_rack, i.e. the sequence number of the last received
10070Sstevel@tonic-gate 	 * probe response, based on the echo reply we have received now, if
10080Sstevel@tonic-gate 	 * either of the following conditions are satisfied.
10090Sstevel@tonic-gate 	 * a. pii_rack is outside the current receive window of
10100Sstevel@tonic-gate 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
10110Sstevel@tonic-gate 	 *    This means we have not received probe responses for a
10120Sstevel@tonic-gate 	 *    long time, and the sequence number has wrapped around.
10130Sstevel@tonic-gate 	 * b. pii_rack is within the current receive window and this echo
10140Sstevel@tonic-gate 	 *    reply corresponds to the highest sequence number we have seen
10150Sstevel@tonic-gate 	 *    so far.
10160Sstevel@tonic-gate 	 */
10170Sstevel@tonic-gate 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
10180Sstevel@tonic-gate 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
10190Sstevel@tonic-gate 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
10200Sstevel@tonic-gate 		pii->pii_rack = pr_icmp_seq;
10210Sstevel@tonic-gate 	}
10220Sstevel@tonic-gate }
10230Sstevel@tonic-gate 
10240Sstevel@tonic-gate /*
10250Sstevel@tonic-gate  * Returns true if seq is the highest unacknowledged seq for target tg
10260Sstevel@tonic-gate  * else returns false
10270Sstevel@tonic-gate  */
10280Sstevel@tonic-gate static boolean_t
10290Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg)
10300Sstevel@tonic-gate {
10310Sstevel@tonic-gate 	struct phyint_instance *pii;
10320Sstevel@tonic-gate 	int	 pr_ndx;
10330Sstevel@tonic-gate 	uint16_t pr_seq;
10340Sstevel@tonic-gate 
10350Sstevel@tonic-gate 	pii = tg->tg_phyint_inst;
10360Sstevel@tonic-gate 
10370Sstevel@tonic-gate 	/*
10380Sstevel@tonic-gate 	 * Get the seq number of the most recent probe sent so far,
10390Sstevel@tonic-gate 	 * and also get the corresponding probe index in the probe stats
10400Sstevel@tonic-gate 	 * array.
10410Sstevel@tonic-gate 	 */
10420Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
10430Sstevel@tonic-gate 	pr_seq = pii->pii_snxt;
10440Sstevel@tonic-gate 	pr_seq--;
10450Sstevel@tonic-gate 
10460Sstevel@tonic-gate 	/*
10470Sstevel@tonic-gate 	 * Start from the most recent probe and walk back, trying to find
10480Sstevel@tonic-gate 	 * an acked probe corresponding to target tg.
10490Sstevel@tonic-gate 	 */
10500Sstevel@tonic-gate 	for (; pr_ndx != pii->pii_probe_next;
10510Sstevel@tonic-gate 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
10520Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
10530Sstevel@tonic-gate 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
10540Sstevel@tonic-gate 			if (SEQ_GT(pr_seq, seq))
10550Sstevel@tonic-gate 				return (_B_FALSE);
10560Sstevel@tonic-gate 		}
10570Sstevel@tonic-gate 	}
10580Sstevel@tonic-gate 	return (_B_TRUE);
10590Sstevel@tonic-gate }
10600Sstevel@tonic-gate 
10610Sstevel@tonic-gate /*
10620Sstevel@tonic-gate  * Check whether the crtt for the group has improved by a factor of
10630Sstevel@tonic-gate  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
10640Sstevel@tonic-gate  * detection time flapping in the face of small crtt changes.
10650Sstevel@tonic-gate  */
10660Sstevel@tonic-gate static boolean_t
10670Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg)
10680Sstevel@tonic-gate {
10690Sstevel@tonic-gate 	struct	phyint *pi;
10700Sstevel@tonic-gate 
10710Sstevel@tonic-gate 	if (debug & D_PROBE)
10720Sstevel@tonic-gate 		logdebug("check_pg_crtt_improved()\n");
10730Sstevel@tonic-gate 
10740Sstevel@tonic-gate 	/*
10750Sstevel@tonic-gate 	 * The crtt for the group is only improved if each phyint_instance
10760Sstevel@tonic-gate 	 * for both ipv4 and ipv6 is improved.
10770Sstevel@tonic-gate 	 */
10780Sstevel@tonic-gate 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
10790Sstevel@tonic-gate 		if (!check_pii_crtt_improved(pi->pi_v4) ||
10800Sstevel@tonic-gate 		    !check_pii_crtt_improved(pi->pi_v6))
10810Sstevel@tonic-gate 			return (_B_FALSE);
10820Sstevel@tonic-gate 	}
10830Sstevel@tonic-gate 
10840Sstevel@tonic-gate 	return (_B_TRUE);
10850Sstevel@tonic-gate }
10860Sstevel@tonic-gate 
10870Sstevel@tonic-gate /*
10880Sstevel@tonic-gate  * Check whether the crtt has improved substantially on this phyint_instance.
10890Sstevel@tonic-gate  * Returns _B_TRUE if there's no crtt information available, because pii
10900Sstevel@tonic-gate  * is NULL or the phyint_instance is not capable of probing.
10910Sstevel@tonic-gate  */
10920Sstevel@tonic-gate boolean_t
10930Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) {
10940Sstevel@tonic-gate 	struct 	target *tg;
10950Sstevel@tonic-gate 
10960Sstevel@tonic-gate 	if (pii == NULL)
10970Sstevel@tonic-gate 		return (_B_TRUE);
10980Sstevel@tonic-gate 
10990Sstevel@tonic-gate 	if (!PROBE_CAPABLE(pii) ||
11000Sstevel@tonic-gate 	    pii->pii_phyint->pi_state == PI_FAILED)
11010Sstevel@tonic-gate 		return (_B_TRUE);
11020Sstevel@tonic-gate 
11030Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11040Sstevel@tonic-gate 		if (tg->tg_status != TG_ACTIVE)
11050Sstevel@tonic-gate 			continue;
11060Sstevel@tonic-gate 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
11070Sstevel@tonic-gate 		    LOWER_FDT_TRIGGER)) {
11080Sstevel@tonic-gate 			return (_B_FALSE);
11090Sstevel@tonic-gate 		}
11100Sstevel@tonic-gate 	}
11110Sstevel@tonic-gate 
11120Sstevel@tonic-gate 	return (_B_TRUE);
11130Sstevel@tonic-gate }
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate /*
11160Sstevel@tonic-gate  * This target responds very slowly to probes. The target's crtt exceeds
11170Sstevel@tonic-gate  * the probe interval of its group. Compare against other targets
11180Sstevel@tonic-gate  * and determine if this target is an exception, if so return true, else false
11190Sstevel@tonic-gate  */
11200Sstevel@tonic-gate static boolean_t
11210Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target)
11220Sstevel@tonic-gate {
11230Sstevel@tonic-gate 	struct	target *tg;
11240Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
11250Sstevel@tonic-gate 
11260Sstevel@tonic-gate 	if (debug & D_PROBE) {
11270Sstevel@tonic-gate 		logdebug("check_exception_target(%s %s target %s)\n",
11280Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
11290Sstevel@tonic-gate 		    pr_addr(pii->pii_af, target->tg_address,
1130*4929Srk129064 		    abuf, sizeof (abuf)));
11310Sstevel@tonic-gate 	}
11320Sstevel@tonic-gate 
11330Sstevel@tonic-gate 	/*
11340Sstevel@tonic-gate 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
11350Sstevel@tonic-gate 	 * to make a good judgement. Otherwise don't drop this target.
11360Sstevel@tonic-gate 	 */
11370Sstevel@tonic-gate 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
11380Sstevel@tonic-gate 		return (_B_FALSE);
11390Sstevel@tonic-gate 
11400Sstevel@tonic-gate 	/*
11410Sstevel@tonic-gate 	 * Determine whether only this particular target is slow.
11420Sstevel@tonic-gate 	 * We know that this target's crtt exceeds the group's probe interval.
11430Sstevel@tonic-gate 	 * If all other active targets have a
11440Sstevel@tonic-gate 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
11450Sstevel@tonic-gate 	 * then this target is considered slow.
11460Sstevel@tonic-gate 	 */
11470Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11480Sstevel@tonic-gate 		if (tg != target && tg->tg_status == TG_ACTIVE) {
11490Sstevel@tonic-gate 			if (tg->tg_crtt >
11500Sstevel@tonic-gate 			    pii->pii_phyint->pi_group->pg_probeint /
11510Sstevel@tonic-gate 			    EXCEPTION_FACTOR) {
11520Sstevel@tonic-gate 				return (_B_FALSE);
11530Sstevel@tonic-gate 			}
11540Sstevel@tonic-gate 		}
11550Sstevel@tonic-gate 	}
11560Sstevel@tonic-gate 
11570Sstevel@tonic-gate 	return (_B_TRUE);
11580Sstevel@tonic-gate }
11590Sstevel@tonic-gate 
11600Sstevel@tonic-gate /*
11610Sstevel@tonic-gate  * Update the target list. The icmp all hosts multicast has given us
11620Sstevel@tonic-gate  * some host to which we can send probes. If we already have sufficient
11630Sstevel@tonic-gate  * targets, discard it.
11640Sstevel@tonic-gate  */
11650Sstevel@tonic-gate static void
11660Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
11670Sstevel@tonic-gate     struct in6_addr fromaddr)
11680Sstevel@tonic-gate /* ARGSUSED */
11690Sstevel@tonic-gate {
11700Sstevel@tonic-gate 	int af;
11710Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
11720Sstevel@tonic-gate 	struct phyint *pi;
11730Sstevel@tonic-gate 
11740Sstevel@tonic-gate 	if (debug & D_PROBE) {
11750Sstevel@tonic-gate 		logdebug("incoming_mcast_reply(%s %s %s)\n",
11760Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
11770Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
11780Sstevel@tonic-gate 	}
11790Sstevel@tonic-gate 
11800Sstevel@tonic-gate 	/*
11810Sstevel@tonic-gate 	 * Using host targets is a fallback mechanism. If we have
11820Sstevel@tonic-gate 	 * found a router, don't add this host target. If we already
11830Sstevel@tonic-gate 	 * know MAX_PROBE_TARGETS, don't add another target.
11840Sstevel@tonic-gate 	 */
11850Sstevel@tonic-gate 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
11860Sstevel@tonic-gate 	if (pii->pii_targets != NULL) {
11870Sstevel@tonic-gate 		if (pii->pii_targets_are_routers ||
11880Sstevel@tonic-gate 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
11890Sstevel@tonic-gate 			return;
11900Sstevel@tonic-gate 		}
11910Sstevel@tonic-gate 	}
11920Sstevel@tonic-gate 
11930Sstevel@tonic-gate 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
11940Sstevel@tonic-gate 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
11950Sstevel@tonic-gate 		/*
11960Sstevel@tonic-gate 		 * Guard against response from 0.0.0.0
11970Sstevel@tonic-gate 		 * and ::. Log a trace message
11980Sstevel@tonic-gate 		 */
11990Sstevel@tonic-gate 		logtrace("probe response from %s on %s\n",
12000Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
12010Sstevel@tonic-gate 		    pii->pii_name);
12020Sstevel@tonic-gate 		return;
12030Sstevel@tonic-gate 	}
12040Sstevel@tonic-gate 
12050Sstevel@tonic-gate 	/*
12060Sstevel@tonic-gate 	 * This address is one of our own, so reject this address as a
12070Sstevel@tonic-gate 	 * valid probe target.
12080Sstevel@tonic-gate 	 */
12090Sstevel@tonic-gate 	af = pii->pii_af;
12102250Srk129064 	if (own_address(fromaddr))
12110Sstevel@tonic-gate 		return;
12120Sstevel@tonic-gate 
12130Sstevel@tonic-gate 	/*
12140Sstevel@tonic-gate 	 * If the phyint is part a named group, then add the address to all
12150Sstevel@tonic-gate 	 * members of the group.  Otherwise, add the address only to the
12160Sstevel@tonic-gate 	 * phyint itself, since other phyints in the anongroup may not be on
12170Sstevel@tonic-gate 	 * the same subnet.
12180Sstevel@tonic-gate 	 */
12190Sstevel@tonic-gate 	pi = pii->pii_phyint;
12200Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
12210Sstevel@tonic-gate 		target_add(pii, fromaddr, _B_FALSE);
12220Sstevel@tonic-gate 	} else {
12230Sstevel@tonic-gate 		pi = pi->pi_group->pg_phyint;
12240Sstevel@tonic-gate 		for (; pi != NULL; pi = pi->pi_pgnext)
12250Sstevel@tonic-gate 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
12260Sstevel@tonic-gate 	}
12270Sstevel@tonic-gate }
12280Sstevel@tonic-gate 
12290Sstevel@tonic-gate /*
12300Sstevel@tonic-gate  * Compute CRTT given an existing scaled average, scaled deviation estimate
12310Sstevel@tonic-gate  * and a new rtt time.  The formula is from Jacobson and Karels'
12320Sstevel@tonic-gate  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
12330Sstevel@tonic-gate  * are the same as those in Appendix A.2 of that paper.
12340Sstevel@tonic-gate  *
12350Sstevel@tonic-gate  * m = new measurement
12360Sstevel@tonic-gate  * sa = scaled RTT average (8 * average estimates)
12370Sstevel@tonic-gate  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
12380Sstevel@tonic-gate  * crtt = Conservative round trip time. Used to determine whether probe
12390Sstevel@tonic-gate  * has timed out.
12400Sstevel@tonic-gate  *
12410Sstevel@tonic-gate  * New scaled average and deviation are passed back via sap and svp
12420Sstevel@tonic-gate  */
12430Sstevel@tonic-gate static int
12440Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m)
12450Sstevel@tonic-gate {
12460Sstevel@tonic-gate 	int sa = *sap;
12470Sstevel@tonic-gate 	int sv = *svp;
12480Sstevel@tonic-gate 	int crtt;
12490Sstevel@tonic-gate 	int saved_m = m;
12500Sstevel@tonic-gate 
12510Sstevel@tonic-gate 	assert(*sap >= -1);
12520Sstevel@tonic-gate 	assert(*svp >= 0);
12530Sstevel@tonic-gate 
12540Sstevel@tonic-gate 	if (sa != -1) {
12550Sstevel@tonic-gate 		/*
12560Sstevel@tonic-gate 		 * Update average estimator:
12570Sstevel@tonic-gate 		 *	new rtt = old rtt + 1/8 Error
12580Sstevel@tonic-gate 		 *	    where Error = m - old rtt
12590Sstevel@tonic-gate 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
12600Sstevel@tonic-gate 		 *	i.e. new sa =  old sa + Error
12610Sstevel@tonic-gate 		 */
12620Sstevel@tonic-gate 		m -= sa >> 3;		/* m is now Error in estimate. */
12630Sstevel@tonic-gate 		if ((sa += m) < 0) {
12640Sstevel@tonic-gate 			/* Don't allow the smoothed average to be negative. */
12650Sstevel@tonic-gate 			sa = 0;
12660Sstevel@tonic-gate 		}
12670Sstevel@tonic-gate 
12680Sstevel@tonic-gate 		/*
12690Sstevel@tonic-gate 		 * Update deviation estimator:
12700Sstevel@tonic-gate 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
12710Sstevel@tonic-gate 		 *	i.e. 4 * new mdev = 4 * old mdev +
12720Sstevel@tonic-gate 		 *		(abs(Error) - old mdev)
12730Sstevel@tonic-gate 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
12740Sstevel@tonic-gate 		 */
12750Sstevel@tonic-gate 		if (m < 0)
12760Sstevel@tonic-gate 			m = -m;
12770Sstevel@tonic-gate 		m -= sv >> 2;
12780Sstevel@tonic-gate 		sv += m;
12790Sstevel@tonic-gate 	} else {
12800Sstevel@tonic-gate 		/* Initialization. This is the first response received. */
12810Sstevel@tonic-gate 		sa = (m << 3);
12820Sstevel@tonic-gate 		sv = (m << 1);
12830Sstevel@tonic-gate 	}
12840Sstevel@tonic-gate 
12850Sstevel@tonic-gate 	crtt = (sa >> 3) + sv;
12860Sstevel@tonic-gate 
12870Sstevel@tonic-gate 	if (debug & D_PROBE) {
12880Sstevel@tonic-gate 		logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = "
12890Sstevel@tonic-gate 		    "%d\n", saved_m, sa, sv, crtt);
12900Sstevel@tonic-gate 	}
12910Sstevel@tonic-gate 
12920Sstevel@tonic-gate 	*sap = sa;
12930Sstevel@tonic-gate 	*svp = sv;
12940Sstevel@tonic-gate 
12950Sstevel@tonic-gate 	/*
12960Sstevel@tonic-gate 	 * CRTT = average estimates  + 4 * deviation estimates
12970Sstevel@tonic-gate 	 *	= sa / 8 + sv
12980Sstevel@tonic-gate 	 */
12990Sstevel@tonic-gate 	return (crtt);
13000Sstevel@tonic-gate }
13010Sstevel@tonic-gate 
13020Sstevel@tonic-gate static void
13030Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni)
13040Sstevel@tonic-gate {
13050Sstevel@tonic-gate 	struct phyint_instance *pii = tg->tg_phyint_inst;
13060Sstevel@tonic-gate 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
13070Sstevel@tonic-gate 	int sa = tg->tg_rtt_sa;
13080Sstevel@tonic-gate 	int sv = tg->tg_rtt_sd;
13090Sstevel@tonic-gate 	int new_crtt;
13100Sstevel@tonic-gate 	int i;
13110Sstevel@tonic-gate 
13120Sstevel@tonic-gate 	if (debug & D_PROBE)
13130Sstevel@tonic-gate 		logdebug("pi_set_crtt: target -  m %d\n", m);
13140Sstevel@tonic-gate 
13150Sstevel@tonic-gate 	/* store the round trip time, in case we need to defer computation */
13160Sstevel@tonic-gate 	tg->tg_deferred[tg->tg_num_deferred] = m;
13170Sstevel@tonic-gate 
13180Sstevel@tonic-gate 	new_crtt = compute_crtt(&sa, &sv, m);
13190Sstevel@tonic-gate 
13200Sstevel@tonic-gate 	/*
13210Sstevel@tonic-gate 	 * If this probe's round trip time would singlehandedly cause an
13220Sstevel@tonic-gate 	 * increase in the group's probe interval consider it suspect.
13230Sstevel@tonic-gate 	 */
13240Sstevel@tonic-gate 	if ((new_crtt > probe_interval) && is_probe_uni) {
13250Sstevel@tonic-gate 		if (debug & D_PROBE) {
13260Sstevel@tonic-gate 			logdebug("Received a suspect probe on %s, new_crtt ="
13270Sstevel@tonic-gate 			    " %d, probe_interval = %d, num_deferred = %d\n",
13280Sstevel@tonic-gate 			    pii->pii_probe_logint->li_name, new_crtt,
13290Sstevel@tonic-gate 			    probe_interval, tg->tg_num_deferred);
13300Sstevel@tonic-gate 		}
13310Sstevel@tonic-gate 
13320Sstevel@tonic-gate 		/*
13330Sstevel@tonic-gate 		 * If we've deferred as many rtts as we plan on deferring, then
13340Sstevel@tonic-gate 		 * assume the link really did slow down and process all queued
13350Sstevel@tonic-gate 		 * rtts
13360Sstevel@tonic-gate 		 */
13370Sstevel@tonic-gate 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
13380Sstevel@tonic-gate 			if (debug & D_PROBE) {
13390Sstevel@tonic-gate 				logdebug("Received MAXDEFERREDRTT probes which "
13400Sstevel@tonic-gate 				    "would cause an increased probe_interval.  "
13410Sstevel@tonic-gate 				    "Integrating queued rtt data points.\n");
13420Sstevel@tonic-gate 			}
13430Sstevel@tonic-gate 
13440Sstevel@tonic-gate 			for (i = 0; i <= tg->tg_num_deferred; i++) {
13450Sstevel@tonic-gate 				tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa,
13460Sstevel@tonic-gate 				    &tg->tg_rtt_sd, tg->tg_deferred[i]);
13470Sstevel@tonic-gate 			}
13480Sstevel@tonic-gate 
13490Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13500Sstevel@tonic-gate 		} else {
13510Sstevel@tonic-gate 			tg->tg_num_deferred++;
13520Sstevel@tonic-gate 		}
13530Sstevel@tonic-gate 		return;
13540Sstevel@tonic-gate 	}
13550Sstevel@tonic-gate 
13560Sstevel@tonic-gate 	/*
13570Sstevel@tonic-gate 	 * If this is a normal probe, or an RTT probe that would lead to a
13580Sstevel@tonic-gate 	 * reduced CRTT, then update our CRTT data.  Further, if this was
13590Sstevel@tonic-gate 	 * a normal probe, pitch any deferred probes since our probes are
13600Sstevel@tonic-gate 	 * again being answered within our CRTT estimates.
13610Sstevel@tonic-gate 	 */
13620Sstevel@tonic-gate 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
13630Sstevel@tonic-gate 		tg->tg_rtt_sa = sa;
13640Sstevel@tonic-gate 		tg->tg_rtt_sd = sv;
13650Sstevel@tonic-gate 		tg->tg_crtt = new_crtt;
13660Sstevel@tonic-gate 		if (is_probe_uni)
13670Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13680Sstevel@tonic-gate 	}
13690Sstevel@tonic-gate }
13700Sstevel@tonic-gate 
/*
 * Search the ancillary data of `msg' for the first IPPROTO_IPV6 level
 * option whose type is `cmsg_type'.
 *
 * Returns a pointer to that option's data buffer, or NULL if `msg'
 * carries no such option.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == IPPROTO_IPV6 &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));

		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
13890Sstevel@tonic-gate 
13900Sstevel@tonic-gate /*
13910Sstevel@tonic-gate  * See if a previously failed interface has started working again.
13920Sstevel@tonic-gate  */
13930Sstevel@tonic-gate void
13940Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi)
13950Sstevel@tonic-gate {
13960Sstevel@tonic-gate 	if (phyint_repaired(pi)) {
13970Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
13980Sstevel@tonic-gate 			logerr("NIC repair detected on %s\n", pi->pi_name);
13990Sstevel@tonic-gate 		} else {
14000Sstevel@tonic-gate 			logerr("NIC repair detected on %s of group %s\n",
14010Sstevel@tonic-gate 			    pi->pi_name, pi->pi_group->pg_name);
14020Sstevel@tonic-gate 		}
14030Sstevel@tonic-gate 
14040Sstevel@tonic-gate 		/*
14050Sstevel@tonic-gate 		 * If the interface is offline, just clear the FAILED flag,
14060Sstevel@tonic-gate 		 * delaying the state change and failback operation until it
14070Sstevel@tonic-gate 		 * is brought back online.
14080Sstevel@tonic-gate 		 */
14090Sstevel@tonic-gate 		if (pi->pi_state == PI_OFFLINE) {
14100Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
14110Sstevel@tonic-gate 			return;
14120Sstevel@tonic-gate 		}
14130Sstevel@tonic-gate 
1414704Sethindra 		if (pi->pi_flags & IFF_STANDBY) {
14150Sstevel@tonic-gate 			(void) change_lif_flags(pi, IFF_FAILED, _B_FALSE);
14160Sstevel@tonic-gate 		} else {
14172496Smeem 			if (try_failback(pi) != IPMP_FAILURE) {
14180Sstevel@tonic-gate 				(void) change_lif_flags(pi,
14190Sstevel@tonic-gate 				    IFF_FAILED, _B_FALSE);
14200Sstevel@tonic-gate 				/* Per state diagram */
14210Sstevel@tonic-gate 				pi->pi_empty = 0;
14220Sstevel@tonic-gate 			}
14230Sstevel@tonic-gate 		}
14240Sstevel@tonic-gate 
14250Sstevel@tonic-gate 		phyint_chstate(pi, PI_RUNNING);
14260Sstevel@tonic-gate 
14270Sstevel@tonic-gate 		if (GROUP_FAILED(pi->pi_group)) {
14280Sstevel@tonic-gate 			/*
14290Sstevel@tonic-gate 			 * This is the 1st phyint to receive a response
14300Sstevel@tonic-gate 			 * after group failure.
14310Sstevel@tonic-gate 			 */
14320Sstevel@tonic-gate 			logerr("At least 1 interface (%s) of group %s has "
14330Sstevel@tonic-gate 			    "repaired\n", pi->pi_name, pi->pi_group->pg_name);
14340Sstevel@tonic-gate 			phyint_group_chstate(pi->pi_group, PG_RUNNING);
14350Sstevel@tonic-gate 		}
14360Sstevel@tonic-gate 	}
14370Sstevel@tonic-gate }
14380Sstevel@tonic-gate 
14390Sstevel@tonic-gate /*
14400Sstevel@tonic-gate  * See if a previously functioning interface has failed, or if the
14410Sstevel@tonic-gate  * whole group of interfaces has failed.
14420Sstevel@tonic-gate  */
14430Sstevel@tonic-gate static void
14440Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii)
14450Sstevel@tonic-gate {
14460Sstevel@tonic-gate 	struct	phyint	*pi;
14470Sstevel@tonic-gate 	struct	phyint	*pi2;
14480Sstevel@tonic-gate 
14490Sstevel@tonic-gate 	pi = pii->pii_phyint;
14500Sstevel@tonic-gate 
14510Sstevel@tonic-gate 	switch (failure_state(pii)) {
14520Sstevel@tonic-gate 	case PHYINT_FAILURE:
14530Sstevel@tonic-gate 		(void) change_lif_flags(pi, IFF_FAILED, _B_TRUE);
14540Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
14550Sstevel@tonic-gate 			logerr("NIC failure detected on %s\n", pii->pii_name);
14560Sstevel@tonic-gate 		} else {
14570Sstevel@tonic-gate 			logerr("NIC failure detected on %s of group %s\n",
14580Sstevel@tonic-gate 			    pii->pii_name, pi->pi_group->pg_name);
14590Sstevel@tonic-gate 		}
14600Sstevel@tonic-gate 		/*
14610Sstevel@tonic-gate 		 * Do the failover, unless the interface is offline (in
14620Sstevel@tonic-gate 		 * which case we've already failed over).
14630Sstevel@tonic-gate 		 */
14640Sstevel@tonic-gate 		if (pi->pi_state != PI_OFFLINE) {
14650Sstevel@tonic-gate 			phyint_chstate(pi, PI_FAILED);
14660Sstevel@tonic-gate 			reset_crtt_all(pi);
14670Sstevel@tonic-gate 			if (!(pi->pi_flags & IFF_INACTIVE))
14680Sstevel@tonic-gate 				(void) try_failover(pi, FAILOVER_NORMAL);
14690Sstevel@tonic-gate 		}
14700Sstevel@tonic-gate 		break;
14710Sstevel@tonic-gate 
14720Sstevel@tonic-gate 	case GROUP_FAILURE:
14730Sstevel@tonic-gate 		logerr("All Interfaces in group %s have failed\n",
14740Sstevel@tonic-gate 		    pi->pi_group->pg_name);
14750Sstevel@tonic-gate 		for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL;
14760Sstevel@tonic-gate 		    pi2 = pi2->pi_pgnext) {
14770Sstevel@tonic-gate 			if (pi2->pi_flags & IFF_OFFLINE)
14780Sstevel@tonic-gate 				continue;
14790Sstevel@tonic-gate 			(void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE);
14800Sstevel@tonic-gate 			reset_crtt_all(pi2);
14810Sstevel@tonic-gate 
14820Sstevel@tonic-gate 			/*
14830Sstevel@tonic-gate 			 * In the case of host targets, we
14840Sstevel@tonic-gate 			 * would have flushed the targets,
14850Sstevel@tonic-gate 			 * and gone to PI_NOTARGETS state.
14860Sstevel@tonic-gate 			 */
14870Sstevel@tonic-gate 			if (pi2->pi_state == PI_RUNNING)
1488704Sethindra 				phyint_chstate(pi2, PI_FAILED);
14890Sstevel@tonic-gate 
14900Sstevel@tonic-gate 			pi2->pi_empty = 0;
14910Sstevel@tonic-gate 			pi2->pi_full = 0;
14920Sstevel@tonic-gate 		}
14930Sstevel@tonic-gate 		break;
14940Sstevel@tonic-gate 
14950Sstevel@tonic-gate 	default:
14960Sstevel@tonic-gate 		break;
14970Sstevel@tonic-gate 	}
14980Sstevel@tonic-gate }
14990Sstevel@tonic-gate 
15000Sstevel@tonic-gate /*
15010Sstevel@tonic-gate  * Determines if any timeout event has occurred and returns the number of
15020Sstevel@tonic-gate  * milliseconds until the next timeout event for the phyint. Returns
15030Sstevel@tonic-gate  * TIMER_INFINITY for "never".
15040Sstevel@tonic-gate  */
15050Sstevel@tonic-gate uint_t
15060Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii)
15070Sstevel@tonic-gate {
15080Sstevel@tonic-gate 	int 	pr_ndx;
15090Sstevel@tonic-gate 	uint_t	timeout;
15100Sstevel@tonic-gate 	struct	target	*cur_tg;
15110Sstevel@tonic-gate 	struct	probe_stats *pr_statp;
15120Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
15130Sstevel@tonic-gate 	struct	phyint *pi;
15140Sstevel@tonic-gate 	int	valid_unack_count;
15150Sstevel@tonic-gate 	int	i;
15160Sstevel@tonic-gate 	int	interval;
15170Sstevel@tonic-gate 	uint_t	check_time;
15180Sstevel@tonic-gate 	uint_t	cur_time;
15190Sstevel@tonic-gate 	hrtime_t cur_hrtime;
15200Sstevel@tonic-gate 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
15210Sstevel@tonic-gate 
15220Sstevel@tonic-gate 	cur_time = getcurrenttime();
15230Sstevel@tonic-gate 
15240Sstevel@tonic-gate 	if (debug & D_TIMER) {
15250Sstevel@tonic-gate 		logdebug("phyint_inst_timer(%s %s)\n",
15260Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
15270Sstevel@tonic-gate 	}
15280Sstevel@tonic-gate 
15290Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
15300Sstevel@tonic-gate 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
15310Sstevel@tonic-gate 		/*
15320Sstevel@tonic-gate 		 * Check to see if we're here due to link up/down flapping; If
15330Sstevel@tonic-gate 		 * enough time has passed, then try to bring the interface
15340Sstevel@tonic-gate 		 * back up; otherwise, schedule a timer to bring it back up
15350Sstevel@tonic-gate 		 * when enough time *has* elapsed.
15360Sstevel@tonic-gate 		 */
15370Sstevel@tonic-gate 		pi = pii->pii_phyint;
15380Sstevel@tonic-gate 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
15390Sstevel@tonic-gate 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
15400Sstevel@tonic-gate 			if (check_time > cur_time)
15410Sstevel@tonic-gate 				return (check_time - cur_time);
15420Sstevel@tonic-gate 
15430Sstevel@tonic-gate 			phyint_check_for_repair(pi);
15440Sstevel@tonic-gate 		}
15450Sstevel@tonic-gate 	}
15460Sstevel@tonic-gate 
15470Sstevel@tonic-gate 	/*
15482496Smeem 	 * If probing is not enabled on this phyint instance, don't proceed.
15490Sstevel@tonic-gate 	 */
15502496Smeem 	if (!PROBE_ENABLED(pii))
15510Sstevel@tonic-gate 		return (TIMER_INFINITY);
15520Sstevel@tonic-gate 
15530Sstevel@tonic-gate 	/*
15540Sstevel@tonic-gate 	 * If the timer has fired too soon, probably triggered
15550Sstevel@tonic-gate 	 * by some other phyint instance, return the remaining
15560Sstevel@tonic-gate 	 * time
15570Sstevel@tonic-gate 	 */
15580Sstevel@tonic-gate 	if (TIME_LT(cur_time, pii->pii_snxt_time))
15590Sstevel@tonic-gate 		return (pii->pii_snxt_time - cur_time);
15600Sstevel@tonic-gate 
15610Sstevel@tonic-gate 	/*
15620Sstevel@tonic-gate 	 * If the link is down, don't send any probes for now.
15630Sstevel@tonic-gate 	 */
15640Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
15650Sstevel@tonic-gate 		return (TIMER_INFINITY);
15660Sstevel@tonic-gate 
15670Sstevel@tonic-gate 	/*
15680Sstevel@tonic-gate 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
15690Sstevel@tonic-gate 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
15700Sstevel@tonic-gate 	 * Base probe time is strictly periodic.
15710Sstevel@tonic-gate 	 */
15720Sstevel@tonic-gate 	interval = GET_RANDOM(
15730Sstevel@tonic-gate 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
15740Sstevel@tonic-gate 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
15750Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
15760Sstevel@tonic-gate 
15770Sstevel@tonic-gate 	/*
15780Sstevel@tonic-gate 	 * Check if the current time > next time to probe. If so, we missed
15790Sstevel@tonic-gate 	 * sending 1 or more probes, probably due to heavy system load. At least
15800Sstevel@tonic-gate 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
15810Sstevel@tonic-gate 	 * were scheduled. Make adjustments to the times, in multiples of
15820Sstevel@tonic-gate 	 * user_probe_interval.
15830Sstevel@tonic-gate 	 */
15840Sstevel@tonic-gate 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
15850Sstevel@tonic-gate 		int n;
15860Sstevel@tonic-gate 
15870Sstevel@tonic-gate 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
15880Sstevel@tonic-gate 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
15890Sstevel@tonic-gate 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
15900Sstevel@tonic-gate 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
15910Sstevel@tonic-gate 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
15920Sstevel@tonic-gate 		    pii->pii_snxt_basetime);
15930Sstevel@tonic-gate 
15940Sstevel@tonic-gate 		/* Collect statistics about missed probes */
15950Sstevel@tonic-gate 		probes_missed.pm_nprobes += n + 1;
15960Sstevel@tonic-gate 		probes_missed.pm_ntimes++;
15970Sstevel@tonic-gate 	}
15980Sstevel@tonic-gate 	pii->pii_snxt_basetime += user_probe_interval;
15990Sstevel@tonic-gate 	interval = pii->pii_snxt_time - cur_time;
16000Sstevel@tonic-gate 	if (debug & D_TARGET) {
16010Sstevel@tonic-gate 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
16020Sstevel@tonic-gate 		    " interval %u\n", cur_time, pii->pii_snxt_time,
16030Sstevel@tonic-gate 		    pii->pii_snxt_basetime, interval);
16040Sstevel@tonic-gate 	}
16050Sstevel@tonic-gate 
16060Sstevel@tonic-gate 	/*
16070Sstevel@tonic-gate 	 * If no targets are known, we need to send an ICMP multicast. The
16080Sstevel@tonic-gate 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
16090Sstevel@tonic-gate 	 * to see if we found a target.
16100Sstevel@tonic-gate 	 */
16110Sstevel@tonic-gate 	if (pii->pii_target_next == NULL) {
16120Sstevel@tonic-gate 		assert(pii->pii_ntargets == 0);
16130Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16140Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
16150Sstevel@tonic-gate 		return (interval);
16160Sstevel@tonic-gate 	}
16170Sstevel@tonic-gate 
16180Sstevel@tonic-gate 	if ((user_probe_interval != probe_interval) &&
16190Sstevel@tonic-gate 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
16200Sstevel@tonic-gate 		/*
16210Sstevel@tonic-gate 		 * the failure detection (fd) probe timer has not yet fired.
16220Sstevel@tonic-gate 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
16230Sstevel@tonic-gate 		 */
16240Sstevel@tonic-gate 		probe(pii, PROBE_RTT, cur_time);
16250Sstevel@tonic-gate 		return (interval);
16260Sstevel@tonic-gate 	}
16270Sstevel@tonic-gate 	/*
16280Sstevel@tonic-gate 	 * the fd probe timer has fired. Need to do all failure
16290Sstevel@tonic-gate 	 * detection / recovery calculations, and then send an fd probe
16300Sstevel@tonic-gate 	 * of type PROBE_UNI.
16310Sstevel@tonic-gate 	 */
16320Sstevel@tonic-gate 	if (user_probe_interval == probe_interval) {
16330Sstevel@tonic-gate 		/*
16340Sstevel@tonic-gate 		 * We could have missed some probes, and then adjusted
16350Sstevel@tonic-gate 		 * pii_snxt_basetime above. Otherwise we could have
16360Sstevel@tonic-gate 		 * blindly added probe_interval to pii_fd_snxt_basetime.
16370Sstevel@tonic-gate 		 */
16380Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16390Sstevel@tonic-gate 	} else {
16400Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime += probe_interval;
16410Sstevel@tonic-gate 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
16420Sstevel@tonic-gate 			int n;
16430Sstevel@tonic-gate 
16440Sstevel@tonic-gate 			n = (cur_time - pii->pii_fd_snxt_basetime) /
16450Sstevel@tonic-gate 			    probe_interval;
16460Sstevel@tonic-gate 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
16470Sstevel@tonic-gate 		}
16480Sstevel@tonic-gate 	}
16490Sstevel@tonic-gate 
16500Sstevel@tonic-gate 	/*
16510Sstevel@tonic-gate 	 * We can have at most, the latest 2 probes that we sent, in
16520Sstevel@tonic-gate 	 * the PR_UNACKED state. All previous probes sent, are either
16530Sstevel@tonic-gate 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
16540Sstevel@tonic-gate 	 * timed out if the probe's time_sent + the CRTT < currenttime.
16550Sstevel@tonic-gate 	 * For each of the last 2 probes, examine whether it has timed
16560Sstevel@tonic-gate 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
16570Sstevel@tonic-gate 	 */
16580Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
16590Sstevel@tonic-gate 	valid_unack_count = 0;
16600Sstevel@tonic-gate 
16610Sstevel@tonic-gate 	for (i = 0; i < 2; i++) {
16620Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[pr_ndx];
16630Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
16640Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
16650Sstevel@tonic-gate 		case PR_ACKED:
16660Sstevel@tonic-gate 			/*
16670Sstevel@tonic-gate 			 * We received back an ACK, so the switch clearly
16680Sstevel@tonic-gate 			 * is not dropping our traffic, and thus we can
16690Sstevel@tonic-gate 			 * enable failure detection immediately.
16700Sstevel@tonic-gate 			 */
16710Sstevel@tonic-gate 			if (pii->pii_fd_hrtime > gethrtime()) {
16720Sstevel@tonic-gate 				if (debug & D_PROBE) {
16730Sstevel@tonic-gate 					logdebug("successful probe on %s; "
16740Sstevel@tonic-gate 					    "ending quiet period\n",
16750Sstevel@tonic-gate 					    pii->pii_phyint->pi_name);
16760Sstevel@tonic-gate 				}
16770Sstevel@tonic-gate 				pii->pii_fd_hrtime = gethrtime();
16780Sstevel@tonic-gate 			}
16790Sstevel@tonic-gate 			break;
16800Sstevel@tonic-gate 
16810Sstevel@tonic-gate 		case PR_UNACKED:
16820Sstevel@tonic-gate 			assert(cur_tg != NULL);
16830Sstevel@tonic-gate 			/*
16840Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
16850Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
16860Sstevel@tonic-gate 			 * not available use group's probe interval,
16870Sstevel@tonic-gate 			 * which is a worst case estimate.
16880Sstevel@tonic-gate 			 */
16890Sstevel@tonic-gate 			if (cur_tg->tg_crtt != 0) {
16900Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
16910Sstevel@tonic-gate 				    cur_tg->tg_crtt;
16920Sstevel@tonic-gate 			} else {
16930Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
16940Sstevel@tonic-gate 				    probe_interval;
16950Sstevel@tonic-gate 			}
16960Sstevel@tonic-gate 			if (TIME_LT(timeout, cur_time)) {
16970Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
16980Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
16990Sstevel@tonic-gate 			} else if (i == 1) {
17000Sstevel@tonic-gate 				/*
17010Sstevel@tonic-gate 				 * We are forced to consider this probe
17020Sstevel@tonic-gate 				 * lost, as we can have at most 2 unack.
17030Sstevel@tonic-gate 				 * probes any time, and we will be sending a
17040Sstevel@tonic-gate 				 * probe at the end of this function.
17050Sstevel@tonic-gate 				 * Normally, we should not be here, but
17060Sstevel@tonic-gate 				 * this can happen if an incoming response
17070Sstevel@tonic-gate 				 * that was considered lost has increased
17080Sstevel@tonic-gate 				 * the crtt for this target, and also bumped
17090Sstevel@tonic-gate 				 * up the FDT. Note that we never cancel or
17100Sstevel@tonic-gate 				 * increase the current pii_time_left, so
17110Sstevel@tonic-gate 				 * when the timer fires, we find 2 valid
17120Sstevel@tonic-gate 				 * unacked probes, and they are yet to timeout
17130Sstevel@tonic-gate 				 */
17140Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
17150Sstevel@tonic-gate 				pr_statp->pr_time_lost = cur_time;
17160Sstevel@tonic-gate 			} else {
17170Sstevel@tonic-gate 				/*
17180Sstevel@tonic-gate 				 * Only the most recent probe can enter
17190Sstevel@tonic-gate 				 * this 'else' arm. The second most recent
17200Sstevel@tonic-gate 				 * probe must take either of the above arms,
17210Sstevel@tonic-gate 				 * if it is unacked.
17220Sstevel@tonic-gate 				 */
17230Sstevel@tonic-gate 				valid_unack_count++;
17240Sstevel@tonic-gate 			}
17250Sstevel@tonic-gate 			break;
17260Sstevel@tonic-gate 		}
17270Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
17280Sstevel@tonic-gate 	}
17290Sstevel@tonic-gate 
17300Sstevel@tonic-gate 	/*
17310Sstevel@tonic-gate 	 * We send out 1 probe randomly in the interval between one half
17320Sstevel@tonic-gate 	 * and one probe interval for the group. Given that the CRTT is always
17330Sstevel@tonic-gate 	 * less than the group's probe interval, we can have at most 1
17340Sstevel@tonic-gate 	 * unacknowledged probe now.  All previous probes are either lost or
17350Sstevel@tonic-gate 	 * acked.
17360Sstevel@tonic-gate 	 */
17370Sstevel@tonic-gate 	assert(valid_unack_count == 0 || valid_unack_count == 1);
17380Sstevel@tonic-gate 
17390Sstevel@tonic-gate 	/*
17400Sstevel@tonic-gate 	 * The timer has fired. Take appropriate action depending
17410Sstevel@tonic-gate 	 * on the current state of the phyint.
17420Sstevel@tonic-gate 	 *
17430Sstevel@tonic-gate 	 * PI_RUNNING state 	- Failure detection and failover
17440Sstevel@tonic-gate 	 * PI_FAILED state 	- Repair detection and failback
17450Sstevel@tonic-gate 	 */
17460Sstevel@tonic-gate 	switch (pii->pii_phyint->pi_state) {
17470Sstevel@tonic-gate 	case PI_FAILED:
17480Sstevel@tonic-gate 		/*
17490Sstevel@tonic-gate 		 * If the most recent probe (excluding unacked probes that
17500Sstevel@tonic-gate 		 * are yet to time out) has been acked, check whether the
17510Sstevel@tonic-gate 		 * phyint is now repaired. If the phyint is repaired, then
17520Sstevel@tonic-gate 		 * attempt failback, unless it is an inactive standby.
17530Sstevel@tonic-gate 		 */
17540Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
17550Sstevel@tonic-gate 			phyint_check_for_repair(pii->pii_phyint);
17560Sstevel@tonic-gate 		}
17570Sstevel@tonic-gate 		break;
17580Sstevel@tonic-gate 
17590Sstevel@tonic-gate 	case PI_RUNNING:
17600Sstevel@tonic-gate 		/*
17610Sstevel@tonic-gate 		 * It's possible our probes have been lost because of a
17620Sstevel@tonic-gate 		 * spanning-tree mandated quiet period on the switch.  If so,
17630Sstevel@tonic-gate 		 * ignore the lost probes and consider the interface to still
17640Sstevel@tonic-gate 		 * be functioning.
17650Sstevel@tonic-gate 		 */
17660Sstevel@tonic-gate 		cur_hrtime = gethrtime();
17670Sstevel@tonic-gate 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
17680Sstevel@tonic-gate 			break;
17690Sstevel@tonic-gate 
17700Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
17710Sstevel@tonic-gate 			/*
17720Sstevel@tonic-gate 			 * We have 1 or more failed probes (excluding unacked
17730Sstevel@tonic-gate 			 * probes that are yet to time out). Determine if the
17740Sstevel@tonic-gate 			 * phyint has failed. If so attempt a failover,
17750Sstevel@tonic-gate 			 * unless it is an inactive standby
17760Sstevel@tonic-gate 			 */
17770Sstevel@tonic-gate 			phyint_inst_check_for_failure(pii);
17780Sstevel@tonic-gate 		}
17790Sstevel@tonic-gate 		break;
17800Sstevel@tonic-gate 
17810Sstevel@tonic-gate 	default:
17820Sstevel@tonic-gate 		logerr("phyint_inst_timer: invalid state %d\n",
17830Sstevel@tonic-gate 		    pii->pii_phyint->pi_state);
17840Sstevel@tonic-gate 		abort();
17850Sstevel@tonic-gate 	}
17860Sstevel@tonic-gate 
17870Sstevel@tonic-gate 	/*
17880Sstevel@tonic-gate 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
17890Sstevel@tonic-gate 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
17900Sstevel@tonic-gate 	 * was called, the target list may be empty.
17910Sstevel@tonic-gate 	 */
17920Sstevel@tonic-gate 	if (pii->pii_target_next != NULL) {
17930Sstevel@tonic-gate 		probe(pii, PROBE_UNI, cur_time);
17940Sstevel@tonic-gate 		/*
17950Sstevel@tonic-gate 		 * If we have just the one probe target, and we're not using
17960Sstevel@tonic-gate 		 * router targets, try to find another as we presently have
17970Sstevel@tonic-gate 		 * no resilience.
17980Sstevel@tonic-gate 		 */
17990Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
18000Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, cur_time);
18010Sstevel@tonic-gate 	} else {
18020Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
18030Sstevel@tonic-gate 	}
18040Sstevel@tonic-gate 	return (interval);
18050Sstevel@tonic-gate }
18060Sstevel@tonic-gate 
18070Sstevel@tonic-gate /*
18080Sstevel@tonic-gate  * Start the probe timer for an interface instance.
18090Sstevel@tonic-gate  */
18100Sstevel@tonic-gate void
18110Sstevel@tonic-gate start_timer(struct phyint_instance *pii)
18120Sstevel@tonic-gate {
18130Sstevel@tonic-gate 	uint32_t interval;
18140Sstevel@tonic-gate 
18150Sstevel@tonic-gate 	/*
18160Sstevel@tonic-gate 	 * Spread the base probe times (pi_snxt_basetime) across phyints
18170Sstevel@tonic-gate 	 * uniformly over the (curtime..curtime + the group's probe_interval).
18180Sstevel@tonic-gate 	 * pi_snxt_basetime is strictly periodic with a frequency of
18190Sstevel@tonic-gate 	 * the group's probe interval. The actual probe time pi_snxt_time
18200Sstevel@tonic-gate 	 * adds some randomness to pi_snxt_basetime and happens in probe().
18210Sstevel@tonic-gate 	 * For the 1st probe on each phyint after the timer is started,
18220Sstevel@tonic-gate 	 * pi_snxt_time and pi_snxt_basetime are the same.
18230Sstevel@tonic-gate 	 */
18240Sstevel@tonic-gate 	interval = GET_RANDOM(0,
18250Sstevel@tonic-gate 	    (int)pii->pii_phyint->pi_group->pg_probeint);
18260Sstevel@tonic-gate 
18270Sstevel@tonic-gate 	pii->pii_snxt_basetime = getcurrenttime() + interval;
18280Sstevel@tonic-gate 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
18290Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime;
18300Sstevel@tonic-gate 	timer_schedule(interval);
18310Sstevel@tonic-gate }
18320Sstevel@tonic-gate 
18330Sstevel@tonic-gate /*
18340Sstevel@tonic-gate  * Restart the probe timer on an interface instance.
18350Sstevel@tonic-gate  */
18360Sstevel@tonic-gate static void
18370Sstevel@tonic-gate restart_timer(struct phyint_instance *pii)
18380Sstevel@tonic-gate {
18390Sstevel@tonic-gate 	/*
18400Sstevel@tonic-gate 	 * We don't need to restart the timer if it was never started in
18410Sstevel@tonic-gate 	 * the first place (pii->pii_basetime_inited not set), as the timer
18420Sstevel@tonic-gate 	 * won't have gone off yet.
18430Sstevel@tonic-gate 	 */
18440Sstevel@tonic-gate 	if (pii->pii_basetime_inited != 0) {
18450Sstevel@tonic-gate 
18460Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
18470Sstevel@tonic-gate 			logdebug("restart timer: restarting timer on %s, "
18480Sstevel@tonic-gate 			    "address family %s\n", pii->pii_phyint->pi_name,
18490Sstevel@tonic-gate 			    AF_STR(pii->pii_af));
18500Sstevel@tonic-gate 
18510Sstevel@tonic-gate 		start_timer(pii);
18520Sstevel@tonic-gate 	}
18530Sstevel@tonic-gate }
18540Sstevel@tonic-gate 
18550Sstevel@tonic-gate static void
18560Sstevel@tonic-gate process_link_state_down(struct phyint *pi)
18570Sstevel@tonic-gate {
18580Sstevel@tonic-gate 	logerr("The link has gone down on %s\n", pi->pi_name);
18590Sstevel@tonic-gate 
18600Sstevel@tonic-gate 	/*
18610Sstevel@tonic-gate 	 * Clear the probe statistics arrays, we don't want the repair
18620Sstevel@tonic-gate 	 * detection logic relying on probes that were succesful prior
18630Sstevel@tonic-gate 	 *  to the link going down.
18640Sstevel@tonic-gate 	 */
18650Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v4))
18660Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v4);
18670Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v6))
18680Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v6);
18690Sstevel@tonic-gate 	/*
18700Sstevel@tonic-gate 	 * Check for interface failure.  Although we know the interface
18710Sstevel@tonic-gate 	 * has failed, we don't know if all the other interfaces in the
18720Sstevel@tonic-gate 	 * group have failed as well.
18730Sstevel@tonic-gate 	 */
18740Sstevel@tonic-gate 	if ((pi->pi_state == PI_RUNNING) ||
18750Sstevel@tonic-gate 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
18760Sstevel@tonic-gate 		if (debug & D_LINKNOTE) {
18770Sstevel@tonic-gate 			logdebug("process_link_state_down:"
18780Sstevel@tonic-gate 			    " checking for failure on %s\n", pi->pi_name);
18790Sstevel@tonic-gate 		}
18800Sstevel@tonic-gate 
18810Sstevel@tonic-gate 		if (pi->pi_v4 != NULL)
18820Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v4);
18830Sstevel@tonic-gate 		else if (pi->pi_v6 != NULL)
18840Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v6);
18850Sstevel@tonic-gate 	}
18860Sstevel@tonic-gate }
18870Sstevel@tonic-gate 
18880Sstevel@tonic-gate static void
18890Sstevel@tonic-gate process_link_state_up(struct phyint *pi)
18900Sstevel@tonic-gate {
18910Sstevel@tonic-gate 	logerr("The link has come up on %s\n", pi->pi_name);
18920Sstevel@tonic-gate 
18930Sstevel@tonic-gate 	/*
18940Sstevel@tonic-gate 	 * We stopped any running timers on each instance when the link
18950Sstevel@tonic-gate 	 * went down, so restart them.
18960Sstevel@tonic-gate 	 */
18970Sstevel@tonic-gate 	if (pi->pi_v4)
18980Sstevel@tonic-gate 		restart_timer(pi->pi_v4);
18990Sstevel@tonic-gate 	if (pi->pi_v6)
19000Sstevel@tonic-gate 		restart_timer(pi->pi_v6);
19010Sstevel@tonic-gate 
19020Sstevel@tonic-gate 	phyint_check_for_repair(pi);
19030Sstevel@tonic-gate 
19040Sstevel@tonic-gate 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
19050Sstevel@tonic-gate 	if (pi->pi_whendx == LINK_UP_PERMIN)
19060Sstevel@tonic-gate 		pi->pi_whendx = 0;
19070Sstevel@tonic-gate }
19080Sstevel@tonic-gate 
19090Sstevel@tonic-gate /*
19100Sstevel@tonic-gate  * Process any changes in link state passed up from the interfaces.
19110Sstevel@tonic-gate  */
19120Sstevel@tonic-gate void
19130Sstevel@tonic-gate process_link_state_changes(void)
19140Sstevel@tonic-gate {
19150Sstevel@tonic-gate 	struct phyint *pi;
19160Sstevel@tonic-gate 
19170Sstevel@tonic-gate 	/* Look for interfaces where the link state has just changed */
19180Sstevel@tonic-gate 
19190Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
19200Sstevel@tonic-gate 		boolean_t old_link_state_up = LINK_UP(pi);
19210Sstevel@tonic-gate 
19220Sstevel@tonic-gate 		/*
19230Sstevel@tonic-gate 		 * Except when the "phyint" structure is created, this is
19240Sstevel@tonic-gate 		 * the only place the link state is updated.  This allows
19250Sstevel@tonic-gate 		 * this routine to detect changes in link state, rather
19260Sstevel@tonic-gate 		 * than just the current state.
19270Sstevel@tonic-gate 		 */
19280Sstevel@tonic-gate 		UPDATE_LINK_STATE(pi);
19290Sstevel@tonic-gate 
19300Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
19310Sstevel@tonic-gate 			/*
19320Sstevel@tonic-gate 			 * Has link just gone down?
19330Sstevel@tonic-gate 			 */
19340Sstevel@tonic-gate 			if (old_link_state_up)
19350Sstevel@tonic-gate 				process_link_state_down(pi);
19360Sstevel@tonic-gate 		} else {
19370Sstevel@tonic-gate 			/*
19380Sstevel@tonic-gate 			 * Has link just gone back up?
19390Sstevel@tonic-gate 			 */
19400Sstevel@tonic-gate 			if (!old_link_state_up)
19410Sstevel@tonic-gate 				process_link_state_up(pi);
19420Sstevel@tonic-gate 		}
19430Sstevel@tonic-gate 	}
19440Sstevel@tonic-gate }
19450Sstevel@tonic-gate 
19460Sstevel@tonic-gate void
19470Sstevel@tonic-gate reset_crtt_all(struct phyint *pi)
19480Sstevel@tonic-gate {
19490Sstevel@tonic-gate 	struct phyint_instance *pii;
19500Sstevel@tonic-gate 	struct target *tg;
19510Sstevel@tonic-gate 
19520Sstevel@tonic-gate 	pii = pi->pi_v4;
19530Sstevel@tonic-gate 	if (pii != NULL) {
19540Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
19550Sstevel@tonic-gate 			tg->tg_crtt = 0;
19560Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
19570Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
19580Sstevel@tonic-gate 		}
19590Sstevel@tonic-gate 	}
19600Sstevel@tonic-gate 
19610Sstevel@tonic-gate 	pii = pi->pi_v6;
19620Sstevel@tonic-gate 	if (pii != NULL) {
19630Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
19640Sstevel@tonic-gate 			tg->tg_crtt = 0;
19650Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
19660Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
19670Sstevel@tonic-gate 		}
19680Sstevel@tonic-gate 	}
19690Sstevel@tonic-gate }
19700Sstevel@tonic-gate 
19710Sstevel@tonic-gate /*
19720Sstevel@tonic-gate  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
19730Sstevel@tonic-gate  * probes on both instances IPv4 and IPv6.
19740Sstevel@tonic-gate  * If the interface has failed, return the time of the first probe failure
19750Sstevel@tonic-gate  * in "tff".
19760Sstevel@tonic-gate  */
19770Sstevel@tonic-gate static int
19780Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
19790Sstevel@tonic-gate {
19800Sstevel@tonic-gate 	uint_t	pi_tff;
19810Sstevel@tonic-gate 	struct	target *cur_tg;
19820Sstevel@tonic-gate 	struct	probe_fail_count pfinfo;
19830Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
19840Sstevel@tonic-gate 	int	pr_ndx;
19850Sstevel@tonic-gate 
19860Sstevel@tonic-gate 	/*
19870Sstevel@tonic-gate 	 * Get the number of consecutive failed probes on
19880Sstevel@tonic-gate 	 * this phyint across all targets. Also get the number
19890Sstevel@tonic-gate 	 * of consecutive failed probes on this target only
19900Sstevel@tonic-gate 	 */
19910Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
19920Sstevel@tonic-gate 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
19930Sstevel@tonic-gate 	probe_fail_info(pii, cur_tg, &pfinfo);
19940Sstevel@tonic-gate 
19950Sstevel@tonic-gate 	/* Get the time of first failure, for later use */
19960Sstevel@tonic-gate 	pi_tff = pfinfo.pf_tff;
19970Sstevel@tonic-gate 
19980Sstevel@tonic-gate 	/*
19990Sstevel@tonic-gate 	 * If the current target has not responded to the
20000Sstevel@tonic-gate 	 * last NUM_PROBE_FAILS probes, and other targets are
20010Sstevel@tonic-gate 	 * responding delete this target. Dead gateway detection
20020Sstevel@tonic-gate 	 * will eventually remove this target (if router) from the
20030Sstevel@tonic-gate 	 * routing tables. If that does not occur, we may end
20040Sstevel@tonic-gate 	 * up adding this to our list again.
20050Sstevel@tonic-gate 	 */
20060Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
20070Sstevel@tonic-gate 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
20080Sstevel@tonic-gate 		if (pii->pii_targets_are_routers) {
20090Sstevel@tonic-gate 			if (cur_tg->tg_status == TG_ACTIVE)
20100Sstevel@tonic-gate 				pii->pii_ntargets--;
20110Sstevel@tonic-gate 			cur_tg->tg_status = TG_DEAD;
20120Sstevel@tonic-gate 			cur_tg->tg_crtt = 0;
20130Sstevel@tonic-gate 			cur_tg->tg_rtt_sa = -1;
20140Sstevel@tonic-gate 			cur_tg->tg_rtt_sd = 0;
20150Sstevel@tonic-gate 			if (pii->pii_target_next == cur_tg)
20160Sstevel@tonic-gate 				pii->pii_target_next = target_next(cur_tg);
20170Sstevel@tonic-gate 		} else {
20180Sstevel@tonic-gate 			target_delete(cur_tg);
20190Sstevel@tonic-gate 			probe(pii, PROBE_MULTI, getcurrenttime());
20200Sstevel@tonic-gate 		}
20210Sstevel@tonic-gate 		return (PHYINT_OK);
20220Sstevel@tonic-gate 	}
20230Sstevel@tonic-gate 
20240Sstevel@tonic-gate 	/*
20250Sstevel@tonic-gate 	 * If the phyint has lost NUM_PROBE_FAILS or more
20260Sstevel@tonic-gate 	 * consecutive probes, on both IPv4 and IPv6 protocol
20270Sstevel@tonic-gate 	 * instances of the phyint, then trigger failure
20280Sstevel@tonic-gate 	 * detection, else return false
20290Sstevel@tonic-gate 	 */
20300Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
20310Sstevel@tonic-gate 		return (PHYINT_OK);
20320Sstevel@tonic-gate 
20330Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
20340Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii_other)) {
20350Sstevel@tonic-gate 		probe_fail_info(pii_other, NULL, &pfinfo);
20360Sstevel@tonic-gate 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
20370Sstevel@tonic-gate 			/*
20380Sstevel@tonic-gate 			 * We have NUM_PROBE_FAILS or more failures
20390Sstevel@tonic-gate 			 * on both IPv4 and IPv6. Get the earliest
20400Sstevel@tonic-gate 			 * time when failure was detected on this
20410Sstevel@tonic-gate 			 * phyint across IPv4 and IPv6.
20420Sstevel@tonic-gate 			 */
20430Sstevel@tonic-gate 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
20440Sstevel@tonic-gate 				pi_tff = pfinfo.pf_tff;
20450Sstevel@tonic-gate 		} else {
20460Sstevel@tonic-gate 			/*
20470Sstevel@tonic-gate 			 * This instance has < NUM_PROBE_FAILS failure.
20480Sstevel@tonic-gate 			 * So return false
20490Sstevel@tonic-gate 			 */
20500Sstevel@tonic-gate 			return (PHYINT_OK);
20510Sstevel@tonic-gate 		}
20520Sstevel@tonic-gate 	}
20530Sstevel@tonic-gate 	*tff = pi_tff;
20540Sstevel@tonic-gate 	return (PHYINT_FAILURE);
20550Sstevel@tonic-gate }
20560Sstevel@tonic-gate 
20570Sstevel@tonic-gate /*
20580Sstevel@tonic-gate  * Check if the link has gone down on this phyint, or it has failed the
20590Sstevel@tonic-gate  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
20600Sstevel@tonic-gate  * Also look at other phyints of this group, for group failures.
20610Sstevel@tonic-gate  */
20620Sstevel@tonic-gate int
20630Sstevel@tonic-gate failure_state(struct phyint_instance *pii)
20640Sstevel@tonic-gate {
20650Sstevel@tonic-gate 	struct	probe_success_count psinfo;
20660Sstevel@tonic-gate 	uint_t	pi2_tls;		/* time last success */
20670Sstevel@tonic-gate 	uint_t	pi_tff;			/* time first fail */
20680Sstevel@tonic-gate 	struct	phyint	*pi2;
20690Sstevel@tonic-gate 	struct	phyint *pi;
20700Sstevel@tonic-gate 	struct	phyint_instance *pii2;
20710Sstevel@tonic-gate 	struct  phyint_group *pg;
20720Sstevel@tonic-gate 	boolean_t alone;
20730Sstevel@tonic-gate 
20740Sstevel@tonic-gate 	if (debug & D_FAILOVER)
20750Sstevel@tonic-gate 		logdebug("phyint_failed(%s)\n", pii->pii_name);
20760Sstevel@tonic-gate 
20770Sstevel@tonic-gate 	pi = pii->pii_phyint;
20780Sstevel@tonic-gate 	pg = pi->pi_group;
20790Sstevel@tonic-gate 
20800Sstevel@tonic-gate 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
2081*4929Srk129064 	    PHYINT_OK)
20820Sstevel@tonic-gate 		return (PHYINT_OK);
20830Sstevel@tonic-gate 
20840Sstevel@tonic-gate 	/*
20850Sstevel@tonic-gate 	 * At this point, the link is down, or the phyint is suspect,
20860Sstevel@tonic-gate 	 * as it has lost NUM_PROBE_FAILS or more probes. If the phyint
20870Sstevel@tonic-gate 	 * does not belong to any group, or is the only member of the
20880Sstevel@tonic-gate 	 * group capable of being probed, return PHYINT_FAILURE.
20890Sstevel@tonic-gate 	 */
20900Sstevel@tonic-gate 	alone = _B_TRUE;
20910Sstevel@tonic-gate 	if (pg != phyint_anongroup) {
20920Sstevel@tonic-gate 		for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
20930Sstevel@tonic-gate 			if (pi2 == pi)
20940Sstevel@tonic-gate 				continue;
20950Sstevel@tonic-gate 			if (PROBE_CAPABLE(pi2->pi_v4) ||
20960Sstevel@tonic-gate 			    PROBE_CAPABLE(pi2->pi_v6)) {
20970Sstevel@tonic-gate 				alone = _B_FALSE;
20980Sstevel@tonic-gate 				break;
20990Sstevel@tonic-gate 			}
21000Sstevel@tonic-gate 		}
21010Sstevel@tonic-gate 	}
21020Sstevel@tonic-gate 	if (alone)
21030Sstevel@tonic-gate 		return (PHYINT_FAILURE);
21040Sstevel@tonic-gate 
21050Sstevel@tonic-gate 	/*
21060Sstevel@tonic-gate 	 * Need to compare against other phyints of the same group
21070Sstevel@tonic-gate 	 * to exclude group failures. If the failure was detected via
21080Sstevel@tonic-gate 	 * probing, then if the time of last success (tls) of any
21090Sstevel@tonic-gate 	 * phyint is more recent than the time of first fail (tff) of the
21100Sstevel@tonic-gate 	 * phyint in question, and the link is up on the phyint,
21110Sstevel@tonic-gate 	 * then it is a phyint failure. Otherwise it is a group failure.
21120Sstevel@tonic-gate 	 * If failure was detected via a link down notification sent from
21130Sstevel@tonic-gate 	 * the driver to IP, we see if any phyints in the group are still
21140Sstevel@tonic-gate 	 * running and haven't received a link down notification.  We
21150Sstevel@tonic-gate 	 * will usually be processing the link down notification shortly
21160Sstevel@tonic-gate 	 * after it was received, so there is no point looking at the tls
21170Sstevel@tonic-gate 	 * of other phyints.
21180Sstevel@tonic-gate 	 */
21190Sstevel@tonic-gate 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
21200Sstevel@tonic-gate 		/* Exclude ourself from comparison */
21210Sstevel@tonic-gate 		if (pi2 == pi)
21220Sstevel@tonic-gate 			continue;
21230Sstevel@tonic-gate 
21240Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
21250Sstevel@tonic-gate 			/*
21260Sstevel@tonic-gate 			 * We use FLAGS_TO_LINK_STATE() to test the
21270Sstevel@tonic-gate 			 * flags directly, rather then LINK_UP() or
21280Sstevel@tonic-gate 			 * LINK_DOWN(), as we may not have got round
21290Sstevel@tonic-gate 			 * to processing the link state for the other
21300Sstevel@tonic-gate 			 * phyints in the group yet.
21310Sstevel@tonic-gate 			 *
21320Sstevel@tonic-gate 			 * The check for PI_RUNNING and group
21330Sstevel@tonic-gate 			 * failure handles the case when the
21340Sstevel@tonic-gate 			 * group begins to recover.  The first
21350Sstevel@tonic-gate 			 * phyint to recover should not trigger
21360Sstevel@tonic-gate 			 * a failover from the soon-to-recover
21370Sstevel@tonic-gate 			 * other phyints to the first recovered
21380Sstevel@tonic-gate 			 * phyint. PI_RUNNING will be set, and
21390Sstevel@tonic-gate 			 * pg_groupfailed cleared only after
21400Sstevel@tonic-gate 			 * receipt of NUM_PROBE_REPAIRS, by
21410Sstevel@tonic-gate 			 * which time the other phyints should
21420Sstevel@tonic-gate 			 * have received at least 1 packet,
21430Sstevel@tonic-gate 			 * and so will not have NUM_PROBE_FAILS.
21440Sstevel@tonic-gate 			 */
21450Sstevel@tonic-gate 			if ((pi2->pi_state == PI_RUNNING) &&
21460Sstevel@tonic-gate 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2))
21470Sstevel@tonic-gate 				return (PHYINT_FAILURE);
21480Sstevel@tonic-gate 		} else {
21490Sstevel@tonic-gate 			/*
21500Sstevel@tonic-gate 			 * Need to compare against both IPv4 and
21510Sstevel@tonic-gate 			 * IPv6 instances.
21520Sstevel@tonic-gate 			 */
21530Sstevel@tonic-gate 			pii2 = pi2->pi_v4;
21540Sstevel@tonic-gate 			if (pii2 != NULL) {
21550Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
21560Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
21570Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
21580Sstevel@tonic-gate 					/*
21590Sstevel@tonic-gate 					 * See comment above regarding check
21600Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
21610Sstevel@tonic-gate 					 */
21620Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
21630Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
21640Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
21650Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
21660Sstevel@tonic-gate 						return (PHYINT_FAILURE);
21670Sstevel@tonic-gate 				}
21680Sstevel@tonic-gate 			}
21690Sstevel@tonic-gate 
21700Sstevel@tonic-gate 			pii2 = pi2->pi_v6;
21710Sstevel@tonic-gate 			if (pii2 != NULL) {
21720Sstevel@tonic-gate 				probe_success_info(pii2, NULL, &psinfo);
21730Sstevel@tonic-gate 				if (psinfo.ps_tls_valid) {
21740Sstevel@tonic-gate 					pi2_tls = psinfo.ps_tls;
21750Sstevel@tonic-gate 					/*
21760Sstevel@tonic-gate 					 * See comment above regarding check
21770Sstevel@tonic-gate 					 * for PI_RUNNING and group failure.
21780Sstevel@tonic-gate 					 */
21790Sstevel@tonic-gate 					if (TIME_GT(pi2_tls, pi_tff) &&
21800Sstevel@tonic-gate 					    (pi2->pi_state == PI_RUNNING) &&
21810Sstevel@tonic-gate 					    !GROUP_FAILED(pg) &&
21820Sstevel@tonic-gate 					    FLAGS_TO_LINK_STATE(pi2))
21830Sstevel@tonic-gate 						return (PHYINT_FAILURE);
21840Sstevel@tonic-gate 				}
21850Sstevel@tonic-gate 			}
21860Sstevel@tonic-gate 		}
21870Sstevel@tonic-gate 	}
21880Sstevel@tonic-gate 
21890Sstevel@tonic-gate 	/*
21900Sstevel@tonic-gate 	 * Change the group state to PG_FAILED if it's not already.
21910Sstevel@tonic-gate 	 */
21920Sstevel@tonic-gate 	if (!GROUP_FAILED(pg))
21930Sstevel@tonic-gate 		phyint_group_chstate(pg, PG_FAILED);
21940Sstevel@tonic-gate 
21950Sstevel@tonic-gate 	return (GROUP_FAILURE);
21960Sstevel@tonic-gate }
21970Sstevel@tonic-gate 
21980Sstevel@tonic-gate /*
21990Sstevel@tonic-gate  * Return the information associated with consecutive probe successes
22000Sstevel@tonic-gate  * starting with the most recent probe. At most the last 2 probes can be
22010Sstevel@tonic-gate  * in the unacknowledged state. All previous probes have either failed
22020Sstevel@tonic-gate  * or succeeded.
22030Sstevel@tonic-gate  */
22040Sstevel@tonic-gate static void
22050Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
22060Sstevel@tonic-gate     struct probe_success_count *psinfo)
22070Sstevel@tonic-gate {
22080Sstevel@tonic-gate 	uint_t	i;
22090Sstevel@tonic-gate 	struct probe_stats *pr_statp;
22100Sstevel@tonic-gate 	uint_t most_recent;
22110Sstevel@tonic-gate 	uint_t second_most_recent;
22120Sstevel@tonic-gate 	boolean_t pi_found_failure = _B_FALSE;
22130Sstevel@tonic-gate 	boolean_t tg_found_failure = _B_FALSE;
22140Sstevel@tonic-gate 	uint_t now;
22150Sstevel@tonic-gate 	uint_t timeout;
22160Sstevel@tonic-gate 	struct target *tg;
22170Sstevel@tonic-gate 
22180Sstevel@tonic-gate 	if (debug & D_FAILOVER)
22190Sstevel@tonic-gate 		logdebug("probe_success_info(%s)\n", pii->pii_name);
22200Sstevel@tonic-gate 
22210Sstevel@tonic-gate 	bzero(psinfo, sizeof (*psinfo));
22220Sstevel@tonic-gate 	now = getcurrenttime();
22230Sstevel@tonic-gate 
22240Sstevel@tonic-gate 	/*
22250Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
22260Sstevel@tonic-gate 	 * of consecutive probe successes. Latch the number of successes
22270Sstevel@tonic-gate 	 * on hitting a failure.
22280Sstevel@tonic-gate 	 */
22290Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
22300Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
22310Sstevel@tonic-gate 
22320Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
22330Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
22340Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
22350Sstevel@tonic-gate 
22360Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
22370Sstevel@tonic-gate 		case PR_UNACKED:
22380Sstevel@tonic-gate 			/*
22390Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
22400Sstevel@tonic-gate 			 */
22410Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
22420Sstevel@tonic-gate 
22430Sstevel@tonic-gate 			tg = pr_statp->pr_target;
22440Sstevel@tonic-gate 			assert(tg != NULL);
22450Sstevel@tonic-gate 			/*
22460Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
22470Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
22480Sstevel@tonic-gate 			 * not available use the value of the group's probe
22490Sstevel@tonic-gate 			 * interval which is a worst case estimate.
22500Sstevel@tonic-gate 			 */
22510Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
22520Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
22530Sstevel@tonic-gate 			} else {
22540Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
22550Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
22560Sstevel@tonic-gate 			}
22570Sstevel@tonic-gate 
22580Sstevel@tonic-gate 			if (TIME_LT(timeout, now)) {
22590Sstevel@tonic-gate 				/*
22600Sstevel@tonic-gate 				 * We hit a failure. Latch the total number of
22610Sstevel@tonic-gate 				 * recent consecutive successes.
22620Sstevel@tonic-gate 				 */
22630Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
22640Sstevel@tonic-gate 				pr_statp->pr_status = PR_LOST;
22650Sstevel@tonic-gate 				pi_found_failure = _B_TRUE;
22660Sstevel@tonic-gate 				if (cur_tg != NULL && tg == cur_tg) {
22670Sstevel@tonic-gate 					/*
22680Sstevel@tonic-gate 					 * We hit a failure for the desired
22690Sstevel@tonic-gate 					 * target. Latch the number of recent
22700Sstevel@tonic-gate 					 * consecutive successes for this target
22710Sstevel@tonic-gate 					 */
22720Sstevel@tonic-gate 					tg_found_failure = _B_TRUE;
22730Sstevel@tonic-gate 				}
22740Sstevel@tonic-gate 			}
22750Sstevel@tonic-gate 			break;
22760Sstevel@tonic-gate 
22770Sstevel@tonic-gate 		case PR_ACKED:
22780Sstevel@tonic-gate 			/*
22790Sstevel@tonic-gate 			 * Bump up the count of probe successes, if we
22800Sstevel@tonic-gate 			 * have not seen any failure so far.
22810Sstevel@tonic-gate 			 */
22820Sstevel@tonic-gate 			if (!pi_found_failure)
22830Sstevel@tonic-gate 				psinfo->ps_nsucc++;
22840Sstevel@tonic-gate 
22850Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
22860Sstevel@tonic-gate 			    !tg_found_failure) {
22870Sstevel@tonic-gate 				psinfo->ps_nsucc_tg++;
22880Sstevel@tonic-gate 			}
22890Sstevel@tonic-gate 
22900Sstevel@tonic-gate 			/*
22910Sstevel@tonic-gate 			 * Record the time of last success, if this is
22920Sstevel@tonic-gate 			 * the most recent probe success.
22930Sstevel@tonic-gate 			 */
22940Sstevel@tonic-gate 			if (!psinfo->ps_tls_valid) {
22950Sstevel@tonic-gate 				psinfo->ps_tls = pr_statp->pr_time_acked;
22960Sstevel@tonic-gate 				psinfo->ps_tls_valid = _B_TRUE;
22970Sstevel@tonic-gate 			}
22980Sstevel@tonic-gate 			break;
22990Sstevel@tonic-gate 
23000Sstevel@tonic-gate 		case PR_LOST:
23010Sstevel@tonic-gate 			/*
23020Sstevel@tonic-gate 			 * We hit a failure. Latch the total number of
23030Sstevel@tonic-gate 			 * recent consecutive successes.
23040Sstevel@tonic-gate 			 */
23050Sstevel@tonic-gate 			pi_found_failure = _B_TRUE;
23060Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
23070Sstevel@tonic-gate 				/*
23080Sstevel@tonic-gate 				 * We hit a failure for the desired target.
23090Sstevel@tonic-gate 				 * Latch the number of recent consecutive
23100Sstevel@tonic-gate 				 * successes for this target
23110Sstevel@tonic-gate 				 */
23120Sstevel@tonic-gate 				tg_found_failure = _B_TRUE;
23130Sstevel@tonic-gate 			}
23140Sstevel@tonic-gate 			break;
23150Sstevel@tonic-gate 
23160Sstevel@tonic-gate 		default:
23170Sstevel@tonic-gate 			return;
23180Sstevel@tonic-gate 
23190Sstevel@tonic-gate 		}
23200Sstevel@tonic-gate 	}
23210Sstevel@tonic-gate }
23220Sstevel@tonic-gate 
23230Sstevel@tonic-gate /*
23240Sstevel@tonic-gate  * Return the information associated with consecutive probe failures
23250Sstevel@tonic-gate  * starting with the most recent probe. Only the last 2 probes can be in the
23260Sstevel@tonic-gate  * unacknowledged state. All previous probes have either failed or succeeded.
23270Sstevel@tonic-gate  */
23280Sstevel@tonic-gate static void
23290Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
23300Sstevel@tonic-gate     struct probe_fail_count *pfinfo)
23310Sstevel@tonic-gate {
23320Sstevel@tonic-gate 	int	i;
23330Sstevel@tonic-gate 	struct probe_stats *pr_statp;
23340Sstevel@tonic-gate 	boolean_t	tg_found_success = _B_FALSE;
23350Sstevel@tonic-gate 	boolean_t	pi_found_success = _B_FALSE;
23360Sstevel@tonic-gate 	int	most_recent;
23370Sstevel@tonic-gate 	int	second_most_recent;
23380Sstevel@tonic-gate 	uint_t	now;
23390Sstevel@tonic-gate 	uint_t	timeout;
23400Sstevel@tonic-gate 	struct	target *tg;
23410Sstevel@tonic-gate 
23420Sstevel@tonic-gate 	if (debug & D_FAILOVER)
23430Sstevel@tonic-gate 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
23440Sstevel@tonic-gate 
23450Sstevel@tonic-gate 	bzero(pfinfo, sizeof (*pfinfo));
23460Sstevel@tonic-gate 	now = getcurrenttime();
23470Sstevel@tonic-gate 
23480Sstevel@tonic-gate 	/*
23490Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
23500Sstevel@tonic-gate 	 * of consecutive probe failures. Latch the number of failures
23510Sstevel@tonic-gate 	 * on hitting a probe success.
23520Sstevel@tonic-gate 	 */
23530Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
23540Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
23550Sstevel@tonic-gate 
23560Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
23570Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
23580Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
23590Sstevel@tonic-gate 
23600Sstevel@tonic-gate 		assert(PR_STATUS_VALID(pr_statp->pr_status));
23610Sstevel@tonic-gate 
23620Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
23630Sstevel@tonic-gate 		case PR_UNACKED:
23640Sstevel@tonic-gate 			/*
23650Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
23660Sstevel@tonic-gate 			 */
23670Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
23680Sstevel@tonic-gate 
23690Sstevel@tonic-gate 			tg = pr_statp->pr_target;
23700Sstevel@tonic-gate 			/*
23710Sstevel@tonic-gate 			 * Target is guaranteed to exist in the unack. state
23720Sstevel@tonic-gate 			 */
23730Sstevel@tonic-gate 			assert(tg != NULL);
23740Sstevel@tonic-gate 			/*
23750Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
23760Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
23770Sstevel@tonic-gate 			 * not available use the group's probe interval,
23780Sstevel@tonic-gate 			 * which is a worst case estimate.
23790Sstevel@tonic-gate 			 */
23800Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
23810Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent + tg->tg_crtt;
23820Sstevel@tonic-gate 			} else {
23830Sstevel@tonic-gate 				timeout = pr_statp->pr_time_sent +
23840Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
23850Sstevel@tonic-gate 			}
23860Sstevel@tonic-gate 
23870Sstevel@tonic-gate 			if (TIME_GT(timeout, now))
23880Sstevel@tonic-gate 				break;
23890Sstevel@tonic-gate 
23900Sstevel@tonic-gate 			pr_statp->pr_time_lost = timeout;
23910Sstevel@tonic-gate 			pr_statp->pr_status = PR_LOST;
23920Sstevel@tonic-gate 			/* FALLTHRU */
23930Sstevel@tonic-gate 
23940Sstevel@tonic-gate 		case PR_LOST:
23950Sstevel@tonic-gate 			if (!pi_found_success) {
23960Sstevel@tonic-gate 				pfinfo->pf_nfail++;
23970Sstevel@tonic-gate 				pfinfo->pf_tff = pr_statp->pr_time_lost;
23980Sstevel@tonic-gate 			}
23990Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
24000Sstevel@tonic-gate 			    !tg_found_success)  {
24010Sstevel@tonic-gate 				pfinfo->pf_nfail_tg++;
24020Sstevel@tonic-gate 			}
24030Sstevel@tonic-gate 			break;
24040Sstevel@tonic-gate 
24050Sstevel@tonic-gate 		default:
24060Sstevel@tonic-gate 			/*
24070Sstevel@tonic-gate 			 * We hit a success or unused slot. Latch the
24080Sstevel@tonic-gate 			 * total number of recent consecutive failures.
24090Sstevel@tonic-gate 			 */
24100Sstevel@tonic-gate 			pi_found_success = _B_TRUE;
24110Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
24120Sstevel@tonic-gate 				/*
24130Sstevel@tonic-gate 				 * We hit a success for the desired target.
24140Sstevel@tonic-gate 				 * Latch the number of recent consecutive
24150Sstevel@tonic-gate 				 * failures for this target
24160Sstevel@tonic-gate 				 */
24170Sstevel@tonic-gate 				tg_found_success = _B_TRUE;
24180Sstevel@tonic-gate 			}
24190Sstevel@tonic-gate 		}
24200Sstevel@tonic-gate 	}
24210Sstevel@tonic-gate }
24220Sstevel@tonic-gate 
24230Sstevel@tonic-gate /*
24240Sstevel@tonic-gate  * Check if the phyint has been repaired.  If no test address has been
24250Sstevel@tonic-gate  * configured, then consider the interface repaired if the link is up (unless
24260Sstevel@tonic-gate  * the link is flapping; see below).  Otherwise, look for proof of probes
24270Sstevel@tonic-gate  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
24280Sstevel@tonic-gate  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
24290Sstevel@tonic-gate  */
24300Sstevel@tonic-gate static boolean_t
24310Sstevel@tonic-gate phyint_repaired(struct phyint *pi)
24320Sstevel@tonic-gate {
24330Sstevel@tonic-gate 	struct	probe_success_count psinfo;
24340Sstevel@tonic-gate 	struct	phyint_instance *pii;
24350Sstevel@tonic-gate 	struct	target *cur_tg;
24360Sstevel@tonic-gate 	int	pr_ndx;
24370Sstevel@tonic-gate 	uint_t	cur_time;
24380Sstevel@tonic-gate 
24390Sstevel@tonic-gate 	if (debug & D_FAILOVER)
24400Sstevel@tonic-gate 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
24410Sstevel@tonic-gate 
24420Sstevel@tonic-gate 	if (LINK_DOWN(pi))
24430Sstevel@tonic-gate 		return (_B_FALSE);
24440Sstevel@tonic-gate 
24450Sstevel@tonic-gate 	/*
24460Sstevel@tonic-gate 	 * If we don't have any test addresses and the link is up, then
24470Sstevel@tonic-gate 	 * consider the interface repaired, unless we've received more than
24480Sstevel@tonic-gate 	 * LINK_UP_PERMIN link up notifications in the last minute, in
24490Sstevel@tonic-gate 	 * which case we keep the link down until we drop back below
24500Sstevel@tonic-gate 	 * the threshold.
24510Sstevel@tonic-gate 	 */
24520Sstevel@tonic-gate 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
24530Sstevel@tonic-gate 		cur_time = getcurrenttime();
24540Sstevel@tonic-gate 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
24550Sstevel@tonic-gate 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
24560Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 0;
24570Sstevel@tonic-gate 			return (_B_TRUE);
24580Sstevel@tonic-gate 		}
24590Sstevel@tonic-gate 		if (!pi->pi_lfmsg_printed) {
24600Sstevel@tonic-gate 			logerr("The link has come up on %s more than %d times "
24610Sstevel@tonic-gate 			    "in the last minute; disabling failback until it "
24620Sstevel@tonic-gate 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
24630Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 1;
24640Sstevel@tonic-gate 		}
24650Sstevel@tonic-gate 
24660Sstevel@tonic-gate 		return (_B_FALSE);
24670Sstevel@tonic-gate 	}
24680Sstevel@tonic-gate 
24690Sstevel@tonic-gate 	pii = pi->pi_v4;
24700Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
24710Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
24720Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
24730Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
24740Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
24750Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
24760Sstevel@tonic-gate 			return (_B_TRUE);
24770Sstevel@tonic-gate 	}
24780Sstevel@tonic-gate 
24790Sstevel@tonic-gate 	pii = pi->pi_v6;
24800Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
24810Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
24820Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
24830Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
24840Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
24850Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
24860Sstevel@tonic-gate 			return (_B_TRUE);
24870Sstevel@tonic-gate 	}
24880Sstevel@tonic-gate 
24890Sstevel@tonic-gate 	return (_B_FALSE);
24900Sstevel@tonic-gate }
24910Sstevel@tonic-gate 
24920Sstevel@tonic-gate /*
24930Sstevel@tonic-gate  * Try failover from phyint 'pi' to a suitable destination.
24940Sstevel@tonic-gate  */
24950Sstevel@tonic-gate int
24960Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type)
24970Sstevel@tonic-gate {
24980Sstevel@tonic-gate 	struct phyint *dst;
24990Sstevel@tonic-gate 	int err;
25000Sstevel@tonic-gate 
25010Sstevel@tonic-gate 	if (debug & D_FAILOVER)
25020Sstevel@tonic-gate 		logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type);
25030Sstevel@tonic-gate 
25040Sstevel@tonic-gate 	/*
25050Sstevel@tonic-gate 	 * Attempt to find a failover destination 'dst'.
25060Sstevel@tonic-gate 	 * dst will be null if any of the following is true
25070Sstevel@tonic-gate 	 * Phyint is not part of a group  OR
25080Sstevel@tonic-gate 	 * Phyint is the only member of a group OR
25090Sstevel@tonic-gate 	 * No suitable failover dst was available
25100Sstevel@tonic-gate 	 */
25110Sstevel@tonic-gate 	dst = get_failover_dst(pi, failover_type);
25120Sstevel@tonic-gate 	if (dst == NULL)
25130Sstevel@tonic-gate 		return (IPMP_EMINRED);
25140Sstevel@tonic-gate 
25150Sstevel@tonic-gate 	dst->pi_empty = 0;			/* Per state diagram */
25160Sstevel@tonic-gate 	pi->pi_full = 0;			/* Per state diagram */
25170Sstevel@tonic-gate 
25180Sstevel@tonic-gate 	err = failover(pi, dst);
25190Sstevel@tonic-gate 
25200Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
25210Sstevel@tonic-gate 		logdebug("failed over from %s to %s ret %d\n",
25220Sstevel@tonic-gate 		    pi->pi_name, dst->pi_name, err);
25230Sstevel@tonic-gate 	}
25240Sstevel@tonic-gate 	if (err == 0) {
25250Sstevel@tonic-gate 		pi->pi_empty = 1;		/* Per state diagram */
25260Sstevel@tonic-gate 		/*
25270Sstevel@tonic-gate 		 * we don't want to print out this message if a
25280Sstevel@tonic-gate 		 * phyint is leaving the group, nor for failover from
25290Sstevel@tonic-gate 		 * standby
25300Sstevel@tonic-gate 		 */
25310Sstevel@tonic-gate 		if (failover_type == FAILOVER_NORMAL) {
25320Sstevel@tonic-gate 			logerr("Successfully failed over from NIC %s to NIC "
25330Sstevel@tonic-gate 			    "%s\n", pi->pi_name, dst->pi_name);
25340Sstevel@tonic-gate 		}
25350Sstevel@tonic-gate 		return (0);
25360Sstevel@tonic-gate 	} else {
25370Sstevel@tonic-gate 		/*
25380Sstevel@tonic-gate 		 * The failover did not succeed. We must retry the failover
25390Sstevel@tonic-gate 		 * only after resyncing our state based on the kernel's.
25400Sstevel@tonic-gate 		 * For eg. either the src or the dst might have been unplumbed
25410Sstevel@tonic-gate 		 * causing this failure. initifs() will be called again,
25420Sstevel@tonic-gate 		 * from main, since full_scan_required has been set to true
25430Sstevel@tonic-gate 		 * by failover();
25440Sstevel@tonic-gate 		 */
25450Sstevel@tonic-gate 		return (IPMP_FAILURE);
25460Sstevel@tonic-gate 	}
25470Sstevel@tonic-gate }
25480Sstevel@tonic-gate 
/*
 * global_errno captures the errno value, if failover() or failback()
 * fails. This is sent to if_mpadm(1M).
 *
 * Both functions store errno here only when their ioctl returns an error;
 * neither of them clears it on success, so the value is only meaningful
 * right after one of them has reported a failure.
 */
int global_errno;
25540Sstevel@tonic-gate 
25550Sstevel@tonic-gate /*
25560Sstevel@tonic-gate  * Attempt failover from phyint 'from' to phyint 'to'.
25570Sstevel@tonic-gate  * IP moves everything from phyint 'from' to phyint 'to'.
25580Sstevel@tonic-gate  */
25590Sstevel@tonic-gate static int
25600Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to)
25610Sstevel@tonic-gate {
25620Sstevel@tonic-gate 	struct	lifreq	lifr;
25630Sstevel@tonic-gate 	int 	ret;
25640Sstevel@tonic-gate 
25650Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
25660Sstevel@tonic-gate 		logdebug("failing over from %s to %s\n",
25670Sstevel@tonic-gate 		    from->pi_name, to->pi_name);
25680Sstevel@tonic-gate 	}
25690Sstevel@tonic-gate 
25700Sstevel@tonic-gate 	/*
25710Sstevel@tonic-gate 	 * Perform the failover. Both IPv4 and IPv6 are failed over
25720Sstevel@tonic-gate 	 * using a single ioctl by passing in AF_UNSPEC family.
25730Sstevel@tonic-gate 	 */
25740Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
25750Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
25760Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
25770Sstevel@tonic-gate 
25780Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr);
25790Sstevel@tonic-gate 	if (ret < 0) {
25800Sstevel@tonic-gate 		global_errno = errno;
25810Sstevel@tonic-gate 		logperror("failover: ioctl (failover)");
25820Sstevel@tonic-gate 	}
25830Sstevel@tonic-gate 
25840Sstevel@tonic-gate 	/*
25850Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
25860Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
25870Sstevel@tonic-gate 	 * to reflect the current state after the failover. If the
25880Sstevel@tonic-gate 	 * failover has failed it will then reissue the failover.
25890Sstevel@tonic-gate 	 */
25900Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
25910Sstevel@tonic-gate 	return (ret);
25920Sstevel@tonic-gate }
25930Sstevel@tonic-gate 
25940Sstevel@tonic-gate /*
25950Sstevel@tonic-gate  * phyint 'pi' has recovered. Attempt failback from every phyint in the same
25960Sstevel@tonic-gate  * group as phyint 'pi' that is a potential failback source, to phyint 'pi'.
25970Sstevel@tonic-gate  * Return values:
25980Sstevel@tonic-gate  * IPMP_SUCCESS:		Failback successful from each of the other
25990Sstevel@tonic-gate  *				phyints in the group.
26000Sstevel@tonic-gate  * IPMP_EFBPARTIAL: 		Failback successful from some of the other
26010Sstevel@tonic-gate  *				phyints in the group.
26020Sstevel@tonic-gate  * IPMP_FAILURE:		Failback syscall failed with some error.
26030Sstevel@tonic-gate  *
26040Sstevel@tonic-gate  * Note that failback is attempted regardless of the setting of the
26050Sstevel@tonic-gate  * failback_enabled flag.
26060Sstevel@tonic-gate  */
26070Sstevel@tonic-gate int
26082496Smeem do_failback(struct phyint *pi)
26090Sstevel@tonic-gate {
26100Sstevel@tonic-gate 	struct  phyint *from;
26110Sstevel@tonic-gate 	boolean_t done;
26120Sstevel@tonic-gate 	boolean_t partial;
26130Sstevel@tonic-gate 	boolean_t attempted_failback = _B_FALSE;
26140Sstevel@tonic-gate 
26150Sstevel@tonic-gate 	if (debug & D_FAILOVER)
26160Sstevel@tonic-gate 		logdebug("do_failback(%s)\n", pi->pi_name);
26170Sstevel@tonic-gate 
26180Sstevel@tonic-gate 	/* If this phyint is not part of a named group, return. */
26190Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
26200Sstevel@tonic-gate 		pi->pi_full = 1;
26210Sstevel@tonic-gate 		return (IPMP_SUCCESS);
26220Sstevel@tonic-gate 	}
26230Sstevel@tonic-gate 
26240Sstevel@tonic-gate 	/*
26250Sstevel@tonic-gate 	 * Attempt failback from every phyint in the group to 'pi'.
26260Sstevel@tonic-gate 	 * The reason for doing this, instead of only from the
26270Sstevel@tonic-gate 	 * phyint to which we did the failover is given below.
26280Sstevel@tonic-gate 	 *
26290Sstevel@tonic-gate 	 * After 'pi' failed, if any app. tries to join on a multicast
26300Sstevel@tonic-gate 	 * address (IPv6), on the failed phyint, IP picks any arbitrary
26310Sstevel@tonic-gate 	 * non-failed phyint in the group, instead of the failed phyint,
26320Sstevel@tonic-gate 	 * in.mpathd is not aware of this. Thus failing back only from the
26330Sstevel@tonic-gate 	 * interface to which 'pi' failed over, will failback the ipif's
26340Sstevel@tonic-gate 	 * but not the ilm's. So we need to failback from all members of
26350Sstevel@tonic-gate 	 * the phyint group
26360Sstevel@tonic-gate 	 */
26370Sstevel@tonic-gate 	done = _B_TRUE;
26380Sstevel@tonic-gate 	partial = _B_FALSE;
26390Sstevel@tonic-gate 	for (from = pi->pi_group->pg_phyint; from != NULL;
26400Sstevel@tonic-gate 	    from = from->pi_pgnext) {
26410Sstevel@tonic-gate 		/* Exclude ourself as a failback src */
26420Sstevel@tonic-gate 		if (from == pi)
26430Sstevel@tonic-gate 			continue;
26440Sstevel@tonic-gate 
26450Sstevel@tonic-gate 		/*
26460Sstevel@tonic-gate 		 * If the 'from' phyint has IPv4 plumbed, the 'to'
26470Sstevel@tonic-gate 		 * phyint must also have IPv4 plumbed. Similar check
26480Sstevel@tonic-gate 		 * for IPv6. IP makes the same check. Otherwise the
26490Sstevel@tonic-gate 		 * failback will fail.
26500Sstevel@tonic-gate 		 */
26510Sstevel@tonic-gate 		if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) ||
26520Sstevel@tonic-gate 		    (from->pi_v6 != NULL && pi->pi_v6 == NULL)) {
26530Sstevel@tonic-gate 			partial = _B_TRUE;
26540Sstevel@tonic-gate 			continue;
26550Sstevel@tonic-gate 		}
26560Sstevel@tonic-gate 
26572496Smeem 		pi->pi_empty = 0;	/* Per state diagram */
26582496Smeem 		attempted_failback = _B_TRUE;
26592496Smeem 		if (failback(from, pi) != 0) {
26602496Smeem 			done = _B_FALSE;
26612496Smeem 			break;
26620Sstevel@tonic-gate 		}
26630Sstevel@tonic-gate 	}
26640Sstevel@tonic-gate 
26650Sstevel@tonic-gate 	/*
26660Sstevel@tonic-gate 	 * We are done. No more phyint from which we can src the failback
26670Sstevel@tonic-gate 	 */
26680Sstevel@tonic-gate 	if (done) {
26690Sstevel@tonic-gate 		if (!partial)
26700Sstevel@tonic-gate 			pi->pi_full = 1;	/* Per state diagram */
26710Sstevel@tonic-gate 		/*
26720Sstevel@tonic-gate 		 * Don't print out a message unless there is a
26730Sstevel@tonic-gate 		 * transition from FAILED to RUNNING. For eg.
26740Sstevel@tonic-gate 		 * we don't want to print out this message if a
26750Sstevel@tonic-gate 		 * phyint is leaving the group, or at startup
26760Sstevel@tonic-gate 		 */
26770Sstevel@tonic-gate 		if (attempted_failback && (pi->pi_flags &
26780Sstevel@tonic-gate 		    (IFF_FAILED | IFF_OFFLINE))) {
26790Sstevel@tonic-gate 			logerr("Successfully failed back to NIC %s\n",
26800Sstevel@tonic-gate 			    pi->pi_name);
26810Sstevel@tonic-gate 		}
26820Sstevel@tonic-gate 		return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS);
26830Sstevel@tonic-gate 	}
26840Sstevel@tonic-gate 
26850Sstevel@tonic-gate 	return (IPMP_FAILURE);
26860Sstevel@tonic-gate }
26870Sstevel@tonic-gate 
26880Sstevel@tonic-gate /*
26890Sstevel@tonic-gate  * This function is similar to do_failback() above, but respects the
26900Sstevel@tonic-gate  * failback_enabled flag for phyints in named groups.
26910Sstevel@tonic-gate  */
26920Sstevel@tonic-gate int
26932496Smeem try_failback(struct phyint *pi)
26940Sstevel@tonic-gate {
26950Sstevel@tonic-gate 	if (debug & D_FAILOVER)
26960Sstevel@tonic-gate 		logdebug("try_failback(%s)\n", pi->pi_name);
26970Sstevel@tonic-gate 
26980Sstevel@tonic-gate 	if (pi->pi_group != phyint_anongroup && !failback_enabled)
26990Sstevel@tonic-gate 		return (IPMP_EFBDISABLED);
27000Sstevel@tonic-gate 
27012496Smeem 	return (do_failback(pi));
27020Sstevel@tonic-gate }
27030Sstevel@tonic-gate 
27040Sstevel@tonic-gate /*
27050Sstevel@tonic-gate  * Failback everything from phyint 'from' that has the same ifindex
27060Sstevel@tonic-gate  * as phyint to's ifindex.
27070Sstevel@tonic-gate  */
27080Sstevel@tonic-gate static int
27090Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to)
27100Sstevel@tonic-gate {
27110Sstevel@tonic-gate 	struct lifreq lifr;
27120Sstevel@tonic-gate 	int ret;
27130Sstevel@tonic-gate 
27140Sstevel@tonic-gate 	if (debug & D_FAILOVER)
27150Sstevel@tonic-gate 		logdebug("failback(%s %s)\n", from->pi_name, to->pi_name);
27160Sstevel@tonic-gate 
27170Sstevel@tonic-gate 	lifr.lifr_addr.ss_family = AF_UNSPEC;
27180Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name));
27190Sstevel@tonic-gate 	lifr.lifr_movetoindex = to->pi_ifindex;
27200Sstevel@tonic-gate 
27210Sstevel@tonic-gate 	ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr);
27220Sstevel@tonic-gate 	if (ret < 0) {
27230Sstevel@tonic-gate 		global_errno = errno;
27240Sstevel@tonic-gate 		logperror("failback: ioctl (failback)");
27250Sstevel@tonic-gate 	}
27260Sstevel@tonic-gate 
27270Sstevel@tonic-gate 	/*
27280Sstevel@tonic-gate 	 * Set full_scan_required to true. This will make us read
27290Sstevel@tonic-gate 	 * the state from the kernel in initifs() and update our tables,
27300Sstevel@tonic-gate 	 * to reflect the current state after the failback. If the
27310Sstevel@tonic-gate 	 * failback has failed it will then reissue the failback.
27320Sstevel@tonic-gate 	 */
27330Sstevel@tonic-gate 	full_scan_required = _B_TRUE;
27340Sstevel@tonic-gate 
27350Sstevel@tonic-gate 	return (ret);
27360Sstevel@tonic-gate }
27370Sstevel@tonic-gate 
27380Sstevel@tonic-gate /*
27390Sstevel@tonic-gate  * Select a target phyint for failing over from 'pi'.
27400Sstevel@tonic-gate  * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred
27410Sstevel@tonic-gate  * target phyint is chosen as follows,
27420Sstevel@tonic-gate  *	1. Pick any inactive standby interface.
27430Sstevel@tonic-gate  *	2. If no inactive standby is available, select any phyint in the
27440Sstevel@tonic-gate  *	   same group that has the least number of logints, (excluding
27450Sstevel@tonic-gate  *	   IFF_NOFAILOVER and !IFF_UP logints)
27460Sstevel@tonic-gate  * If we are failing over from a standby, failover_type is
27470Sstevel@tonic-gate  * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination.
27480Sstevel@tonic-gate  * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY,
27490Sstevel@tonic-gate  * and we won't return NULL, as long as there is at least 1 other phyint
27500Sstevel@tonic-gate  * in the group.
27510Sstevel@tonic-gate  */
27520Sstevel@tonic-gate static struct phyint *
27530Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type)
27540Sstevel@tonic-gate {
27550Sstevel@tonic-gate 	struct phyint	*maybe = NULL;
27560Sstevel@tonic-gate 	struct phyint	*pi2;
27570Sstevel@tonic-gate 	struct phyint 	*last_choice = NULL;
27580Sstevel@tonic-gate 
27590Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup)
27600Sstevel@tonic-gate 		return (NULL);
27610Sstevel@tonic-gate 
27620Sstevel@tonic-gate 	/*
27630Sstevel@tonic-gate 	 * Loop thru the phyints in the group, and pick the preferred
27640Sstevel@tonic-gate 	 * phyint for the target.
27650Sstevel@tonic-gate 	 */
27660Sstevel@tonic-gate 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
27670Sstevel@tonic-gate 		/* Exclude ourself and offlined interfaces */
27680Sstevel@tonic-gate 		if (pi2 == pi || pi2->pi_state == PI_OFFLINE)
27690Sstevel@tonic-gate 			continue;
27700Sstevel@tonic-gate 
27710Sstevel@tonic-gate 		/*
27720Sstevel@tonic-gate 		 * The chosen target phyint must have IPv4 instance
27730Sstevel@tonic-gate 		 * plumbed, if the src phyint has IPv4 plumbed. Similarly
27740Sstevel@tonic-gate 		 * for IPv6.
27750Sstevel@tonic-gate 		 */
27760Sstevel@tonic-gate 		if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) ||
27770Sstevel@tonic-gate 		    (pi2->pi_v6 == NULL && pi->pi_v6 != NULL))
27780Sstevel@tonic-gate 			continue;
27790Sstevel@tonic-gate 
27800Sstevel@tonic-gate 		/* The chosen target must be PI_RUNNING. */
27810Sstevel@tonic-gate 		if (pi2->pi_state != PI_RUNNING) {
27820Sstevel@tonic-gate 			last_choice = pi2;
27830Sstevel@tonic-gate 			continue;
27840Sstevel@tonic-gate 		}
27850Sstevel@tonic-gate 
2786704Sethindra 		if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) &&
27870Sstevel@tonic-gate 		    (failover_type != FAILOVER_TO_NONSTANDBY)) {
27880Sstevel@tonic-gate 			return (pi2);
27890Sstevel@tonic-gate 		} else {
27900Sstevel@tonic-gate 			if (maybe == NULL)
27910Sstevel@tonic-gate 				maybe = pi2;
27920Sstevel@tonic-gate 			else if (logint_upcount(pi2) < logint_upcount(maybe))
27930Sstevel@tonic-gate 				maybe = pi2;
27940Sstevel@tonic-gate 		}
27950Sstevel@tonic-gate 	}
27960Sstevel@tonic-gate 	if (maybe == NULL && failover_type == FAILOVER_TO_ANY)
27970Sstevel@tonic-gate 		return (last_choice);
27980Sstevel@tonic-gate 	else
27990Sstevel@tonic-gate 		return (maybe);
28000Sstevel@tonic-gate }
28010Sstevel@tonic-gate 
28020Sstevel@tonic-gate /*
28030Sstevel@tonic-gate  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
28040Sstevel@tonic-gate  */
28050Sstevel@tonic-gate boolean_t
28060Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl)
28070Sstevel@tonic-gate {
28080Sstevel@tonic-gate 	int ifsock;
28090Sstevel@tonic-gate 	struct lifreq lifr;
2810*4929Srk129064 	uint64_t old_flags;
28110Sstevel@tonic-gate 
28120Sstevel@tonic-gate 	if (debug & D_FAILOVER) {
28130Sstevel@tonic-gate 		logdebug("change_lif_flags(%s): flags %llx setfl %d\n",
28140Sstevel@tonic-gate 		    pi->pi_name, flags, (int)setfl);
28150Sstevel@tonic-gate 	}
28160Sstevel@tonic-gate 
28170Sstevel@tonic-gate 	if (pi->pi_v4 != NULL) {
28180Sstevel@tonic-gate 		ifsock = ifsock_v4;
28190Sstevel@tonic-gate 	} else  {
28200Sstevel@tonic-gate 		ifsock = ifsock_v6;
28210Sstevel@tonic-gate 	}
28220Sstevel@tonic-gate 
28230Sstevel@tonic-gate 	/*
28240Sstevel@tonic-gate 	 * Get the current flags from the kernel, and set/clear the
28250Sstevel@tonic-gate 	 * desired phyint flags. Since we set only phyint flags, we can
28260Sstevel@tonic-gate 	 * do it on either IPv4 or IPv6 instance.
28270Sstevel@tonic-gate 	 */
28280Sstevel@tonic-gate 	(void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
28290Sstevel@tonic-gate 	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
28300Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
28310Sstevel@tonic-gate 		if (errno != ENXIO)
28320Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (get flags)");
28330Sstevel@tonic-gate 		return (_B_FALSE);
28340Sstevel@tonic-gate 	}
2835*4929Srk129064 
2836*4929Srk129064 	old_flags = lifr.lifr_flags;
28370Sstevel@tonic-gate 	if (setfl)
28380Sstevel@tonic-gate 		lifr.lifr_flags |= flags;
28390Sstevel@tonic-gate 	else
28400Sstevel@tonic-gate 		lifr.lifr_flags &= ~flags;
2841*4929Srk129064 
2842*4929Srk129064 	if (old_flags == lifr.lifr_flags) {
2843*4929Srk129064 		/* No change in the flags. No need to send ioctl */
2844*4929Srk129064 		return (_B_TRUE);
2845*4929Srk129064 	}
2846*4929Srk129064 
28470Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
28480Sstevel@tonic-gate 		if (errno != ENXIO)
28490Sstevel@tonic-gate 			logperror("change_lif_flags: ioctl (set flags)");
28500Sstevel@tonic-gate 		return (_B_FALSE);
28510Sstevel@tonic-gate 	}
28520Sstevel@tonic-gate 
28530Sstevel@tonic-gate 	/*
28540Sstevel@tonic-gate 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
28550Sstevel@tonic-gate 	 * phyint flags.
28560Sstevel@tonic-gate 	 */
28570Sstevel@tonic-gate 	if (setfl)
28580Sstevel@tonic-gate 		pi->pi_flags |= flags;
28590Sstevel@tonic-gate 	else
28600Sstevel@tonic-gate 		pi->pi_flags &= ~flags;
28610Sstevel@tonic-gate 
28620Sstevel@tonic-gate 	if (pi->pi_v4)
28630Sstevel@tonic-gate 		pi->pi_v4->pii_flags = pi->pi_flags;
28640Sstevel@tonic-gate 
28650Sstevel@tonic-gate 	if (pi->pi_v6)
28660Sstevel@tonic-gate 		pi->pi_v6->pii_flags = pi->pi_flags;
28670Sstevel@tonic-gate 
28680Sstevel@tonic-gate 	return (_B_TRUE);
28690Sstevel@tonic-gate }
28700Sstevel@tonic-gate 
/*
 * Compute the ICMP checksum used for IPv4.
 *
 *	addr	- start of the data, read as a sequence of 16 bit words
 *	len	- length of the data in bytes; may be odd
 *
 * The 16 bit words are added into a 32 bit accumulator. A trailing odd
 * byte is added as a 16 bit word whose first byte in memory is that byte
 * and whose other byte is zero. The carries that collected in the upper
 * 16 bits are then folded back into the lower 16 bits.
 *
 * Returns the one's complement of the folded sum, truncated to 16 bits.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t	*wordp = addr;
	ushort_t	tail = 0;
	ushort_t	cksum;
	int		remaining;
	int		acc = 0;

	/* Add up all complete 16 bit words. */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *wordp++;

	/* Pick up a leftover odd byte, if there is one. */
	if (remaining == 1) {
		*(uchar_t *)&tail = *(uchar_t *)wordp;
		acc += tail;
	}

	/* Fold the upper 16 bits into the lower 16 bits, twice. */
	acc = (acc & 0xffff) + (acc >> 16);
	acc += acc >> 16;

	/* The assignment truncates the complement to 16 bits. */
	cksum = ~acc;
	return (cksum);
}
29080Sstevel@tonic-gate 
29090Sstevel@tonic-gate static void
29100Sstevel@tonic-gate reset_snxt_basetimes(void)
29110Sstevel@tonic-gate {
29120Sstevel@tonic-gate 	struct phyint_instance *pii;
29130Sstevel@tonic-gate 
29140Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
29150Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
29160Sstevel@tonic-gate 	}
29170Sstevel@tonic-gate }
29180Sstevel@tonic-gate 
29190Sstevel@tonic-gate /*
29200Sstevel@tonic-gate  * Is the address one of our own addresses? Unfortunately,
29210Sstevel@tonic-gate  * we cannot check our phyint tables to determine if the address
29220Sstevel@tonic-gate  * is our own. This is because, we don't track interfaces that
29230Sstevel@tonic-gate  * are not part of any group. We have to either use a 'bind' or
29240Sstevel@tonic-gate  * get the complete list of all interfaces using SIOCGLIFCONF,
29252250Srk129064  * to do this check. We could also use SIOCTMYADDR.
29262250Srk129064  * Bind fails for the local zone address, so we might include local zone
29272250Srk129064  * address as target address. If local zone address is a target address
29282250Srk129064  * and it is up, it is not possible to detect the interface failure.
29292250Srk129064  * SIOCTMYADDR also doesn't consider local zone address as own address.
29302250Srk129064  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
29312250Srk129064  * are stored in laddr_list.
29320Sstevel@tonic-gate  */
29330Sstevel@tonic-gate 
29342250Srk129064 boolean_t
29352250Srk129064 own_address(struct in6_addr addr)
29362250Srk129064 {
29372250Srk129064 	struct local_addr *taddr = laddr_list;
29382250Srk129064 
29392250Srk129064 	for (; taddr != NULL; taddr = taddr->next) {
29402250Srk129064 		if (IN6_ARE_ADDR_EQUAL(&addr, &taddr->addr)) {
29412250Srk129064 			return (_B_TRUE);
29420Sstevel@tonic-gate 		}
29430Sstevel@tonic-gate 	}
29442250Srk129064 	return (_B_FALSE);
29450Sstevel@tonic-gate }
2946