xref: /onnv-gate/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_probe.c (revision 10649:ab3ce9d83b84)
10Sstevel@tonic-gate /*
28485SPeter.Memishian@Sun.COM  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
30Sstevel@tonic-gate  * Use is subject to license terms.
40Sstevel@tonic-gate  */
50Sstevel@tonic-gate 
60Sstevel@tonic-gate /*
70Sstevel@tonic-gate  * Copyright (c) 1987 Regents of the University of California.
80Sstevel@tonic-gate  * All rights reserved.
90Sstevel@tonic-gate  *
100Sstevel@tonic-gate  * Redistribution and use in source and binary forms are permitted
110Sstevel@tonic-gate  * provided that the above copyright notice and this paragraph are
120Sstevel@tonic-gate  * duplicated in all such forms and that any documentation,
130Sstevel@tonic-gate  * advertising materials, and other materials related to such
140Sstevel@tonic-gate  * distribution and use acknowledge that the software was developed
150Sstevel@tonic-gate  * by the University of California, Berkeley. The name of the
160Sstevel@tonic-gate  * University may not be used to endorse or promote products derived
170Sstevel@tonic-gate  * from this software without specific prior written permission.
180Sstevel@tonic-gate  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
190Sstevel@tonic-gate  * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
200Sstevel@tonic-gate  * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
210Sstevel@tonic-gate  */
220Sstevel@tonic-gate 
230Sstevel@tonic-gate #include "mpd_defs.h"
240Sstevel@tonic-gate #include "mpd_tables.h"
250Sstevel@tonic-gate 
/*
 * Probe types accepted by probe().  The chosen value is carried in the
 * pr_icmp_mtype field of the probe packet and is used to dispatch the
 * echo reply when it comes back.
 */
#define	PROBE_UNI	0x1234		/* unicast probe */
#define	PROBE_MULTI	0x5678		/* multicast probe */
#define	PROBE_RTT	0x9abc		/* probe used for RTT only */

/* Number of milliseconds in a minute */
#define	MSEC_PERMIN	(60 * MILLISEC)
340Sstevel@tonic-gate 
/*
 * Layout of a probe packet and of its response: an ICMP Echo request or an
 * ICMP Echo reply.  The same layout is used for both IPv4 and IPv6.
 */
struct pr_icmp {
	uint8_t		pr_icmp_type;		/* ICMP type */
	uint8_t		pr_icmp_code;		/* ICMP code */
	uint16_t	pr_icmp_cksum;		/* ICMP checksum */
	uint16_t	pr_icmp_id;		/* identification */
	uint16_t	pr_icmp_seq;		/* sequence number */
	uint64_t	pr_icmp_timestamp;	/* time stamp, in ns */
	uint32_t	pr_icmp_mtype;		/* message type (PROBE_*) */
};
490Sstevel@tonic-gate 
500Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0,
510Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
520Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x0,
530Sstevel@tonic-gate 				    0x0, 0x0, 0x0, 0x1 } };
540Sstevel@tonic-gate 
550Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } };
560Sstevel@tonic-gate 
570Sstevel@tonic-gate static hrtime_t	last_fdt_bumpup_time;	/* When FDT was bumped up last */
580Sstevel@tonic-gate 
598485SPeter.Memishian@Sun.COM static void		*find_ancillary(struct msghdr *msg, int cmsg_level,
608485SPeter.Memishian@Sun.COM     int cmsg_type);
618485SPeter.Memishian@Sun.COM static void		pi_set_crtt(struct target *tg, int64_t m,
620Sstevel@tonic-gate     boolean_t is_probe_uni);
630Sstevel@tonic-gate static void		incoming_echo_reply(struct phyint_instance *pii,
648485SPeter.Memishian@Sun.COM     struct pr_icmp *reply, struct in6_addr fromaddr, struct timeval *recv_tvp);
650Sstevel@tonic-gate static void		incoming_rtt_reply(struct phyint_instance *pii,
660Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
670Sstevel@tonic-gate static void		incoming_mcast_reply(struct phyint_instance *pii,
680Sstevel@tonic-gate     struct pr_icmp *reply, struct in6_addr fromaddr);
690Sstevel@tonic-gate 
700Sstevel@tonic-gate static boolean_t	check_pg_crtt_improved(struct phyint_group *pg);
710Sstevel@tonic-gate static boolean_t	check_pii_crtt_improved(struct phyint_instance *pii);
720Sstevel@tonic-gate static boolean_t	check_exception_target(struct phyint_instance *pii,
730Sstevel@tonic-gate     struct target *target);
740Sstevel@tonic-gate static void		probe_fail_info(struct phyint_instance *pii,
750Sstevel@tonic-gate     struct target *cur_tg, struct probe_fail_count *pfinfo);
760Sstevel@tonic-gate static void		probe_success_info(struct phyint_instance *pii,
770Sstevel@tonic-gate     struct target *cur_tg, struct probe_success_count *psinfo);
780Sstevel@tonic-gate static boolean_t	phyint_repaired(struct phyint *pi);
790Sstevel@tonic-gate 
800Sstevel@tonic-gate static boolean_t	highest_ack_tg(uint16_t seq, struct target *tg);
810Sstevel@tonic-gate static int 		in_cksum(ushort_t *addr, int len);
820Sstevel@tonic-gate static void		reset_snxt_basetimes(void);
838485SPeter.Memishian@Sun.COM static int		ns2ms(int64_t ns);
848485SPeter.Memishian@Sun.COM static int64_t		tv2ns(struct timeval *);
850Sstevel@tonic-gate 
860Sstevel@tonic-gate /*
870Sstevel@tonic-gate  * CRTT - Conservative Round Trip Time Estimate
880Sstevel@tonic-gate  * Probe success - A matching probe reply received before CRTT ms has elapsed
890Sstevel@tonic-gate  *	after sending the probe.
900Sstevel@tonic-gate  * Probe failure - No probe reply received and more than CRTT ms has elapsed
910Sstevel@tonic-gate  *	after sending the probe.
920Sstevel@tonic-gate  *
930Sstevel@tonic-gate  * TLS - Time last success. Most recent probe ack received at this time.
940Sstevel@tonic-gate  * TFF - Time first fail. The time of the earliest probe failure in
950Sstevel@tonic-gate  *	a consecutive series of probe failures.
960Sstevel@tonic-gate  * NUM_PROBE_REPAIRS  - Number of consecutive successful probes required
970Sstevel@tonic-gate  * 	before declaring phyint repair.
980Sstevel@tonic-gate  * NUM_PROBE_FAILS - Number of consecutive probe failures required to
990Sstevel@tonic-gate  *	declare a phyint failure.
1000Sstevel@tonic-gate  *
1010Sstevel@tonic-gate  * 			Phyint state diagram
1020Sstevel@tonic-gate  *
1030Sstevel@tonic-gate  * The state of a phyint that is capable of being probed, is completely
1048485SPeter.Memishian@Sun.COM  * specified by the 3-tuple <pi_state, pg_state, I>.
1050Sstevel@tonic-gate  *
10610290SPeter.Memishian@Sun.COM  * A phyint starts in either PI_RUNNING or PI_OFFLINE, depending on whether
10710290SPeter.Memishian@Sun.COM  * IFF_OFFLINE is set.  If the phyint is also configured with a test address
10810290SPeter.Memishian@Sun.COM  * (the common case) and probe targets, then a phyint must also successfully
10910290SPeter.Memishian@Sun.COM  * be able to send and receive probes in order to remain in the PI_RUNNING
11010290SPeter.Memishian@Sun.COM  * state (otherwise, it transitions to PI_FAILED).
1110Sstevel@tonic-gate  *
1120Sstevel@tonic-gate  * Further, if a PI_RUNNING phyint is configured with a test address but is
1130Sstevel@tonic-gate  * unable to find any probe targets, it will transition to the PI_NOTARGETS
1140Sstevel@tonic-gate  * state, which indicates that the link is apparently functional but that
1150Sstevel@tonic-gate  * in.mpathd is unable to send probes to verify functionality (in this case,
1160Sstevel@tonic-gate  * in.mpathd makes the optimistic assumption that the interface is working
1178485SPeter.Memishian@Sun.COM  * correctly and thus does not mark the interface FAILED, but reports it as
1188485SPeter.Memishian@Sun.COM  * IPMP_IF_UNKNOWN through the async events and query interfaces).
1190Sstevel@tonic-gate  *
1200Sstevel@tonic-gate  * At any point, a phyint may be administratively marked offline via if_mpadm.
1210Sstevel@tonic-gate  * In this case, the interface always transitions to PI_OFFLINE, regardless
1220Sstevel@tonic-gate  * of its previous state.  When the interface is later brought back online,
1230Sstevel@tonic-gate  * in.mpathd acts as if the interface is new (and thus it transitions to
1240Sstevel@tonic-gate  * PI_RUNNING or PI_FAILED based on the status of the link and the result of
1250Sstevel@tonic-gate  * its probes, if probes are sent).
1260Sstevel@tonic-gate  *
1270Sstevel@tonic-gate  * pi_state -  PI_RUNNING or PI_FAILED
1280Sstevel@tonic-gate  *	PI_RUNNING: The failure detection logic says the phyint is good.
1290Sstevel@tonic-gate  *	PI_FAILED: The failure detection logic says the phyint has failed.
1300Sstevel@tonic-gate  *
1318485SPeter.Memishian@Sun.COM  * pg_state  - PG_OK, PG_DEGRADED, or PG_FAILED.
1328485SPeter.Memishian@Sun.COM  *	PG_OK: All interfaces in the group are OK.
1338485SPeter.Memishian@Sun.COM  *	PG_DEGRADED: Some interfaces in the group are unusable.
1348485SPeter.Memishian@Sun.COM  *	PG_FAILED: All interfaces in the group are unusable.
1358485SPeter.Memishian@Sun.COM  *
 *	On group failure with router targets, we assume that the current
 *	list of targets obtained from the routing table is still valid, so
 *	the phyint state is PI_FAILED. With host targets, we delete the
 *	list of targets and multicast to the all-hosts address to
 *	reconstruct the target list, so the phyints are in the PI_NOTARGETS
 *	state.
1410Sstevel@tonic-gate  *
1420Sstevel@tonic-gate  * I -	value of (pi_flags & IFF_INACTIVE)
1438485SPeter.Memishian@Sun.COM  *	IFF_INACTIVE: This phyint will not send or receive packets.
1448485SPeter.Memishian@Sun.COM  *	Usually, inactive is tied to standby interfaces that are not yet
1458485SPeter.Memishian@Sun.COM  *	needed (e.g., no non-standby interfaces in the group have failed).
1468485SPeter.Memishian@Sun.COM  *	When failback has been disabled (FAILBACK=no configured), phyint can
1478485SPeter.Memishian@Sun.COM  *	also be a non-STANDBY. In this case IFF_INACTIVE is set when phyint
1488485SPeter.Memishian@Sun.COM  *	subsequently recovers after a failure.
1490Sstevel@tonic-gate  *
1508485SPeter.Memishian@Sun.COM  * Not all 9 possible combinations of the above 3-tuple are possible.
1510Sstevel@tonic-gate  *
1528485SPeter.Memishian@Sun.COM  * I is tracked by IP. pi_state is tracked by mpathd.
1530Sstevel@tonic-gate  *
1540Sstevel@tonic-gate  *			pi_state state machine
1550Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1560Sstevel@tonic-gate  *	Event			State			New State
1570Sstevel@tonic-gate  *				Action:
1580Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1598485SPeter.Memishian@Sun.COM  *	IP interface failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
1600Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
1610Sstevel@tonic-gate  *
1628485SPeter.Memishian@Sun.COM  *	IP interface failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 0)
1630Sstevel@tonic-gate  *	detection		: set IFF_FAILED on this phyint
1640Sstevel@tonic-gate  *
1658485SPeter.Memishian@Sun.COM  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=yes)
166704Sethindra  *	detection				     -> (PI_RUNNING, I == 0)
1670Sstevel@tonic-gate  *				: clear IFF_FAILED on this phyint
1680Sstevel@tonic-gate  *
1698485SPeter.Memishian@Sun.COM  *	IP interface repair 	(PI_FAILED, I == 0, FAILBACK=no)
170704Sethindra  *	detection				     ->	(PI_RUNNING, I == 1)
171704Sethindra  *				: clear IFF_FAILED on this phyint
172704Sethindra  *				: if failback is disabled set I == 1
1730Sstevel@tonic-gate  *
1740Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
1750Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_FAILED
1760Sstevel@tonic-gate  *	(Router targets)	: set IFF_FAILED
1770Sstevel@tonic-gate  *
1780Sstevel@tonic-gate  *	Group failure		(perform on all phyints in the group)
1790Sstevel@tonic-gate  *	detection 		PI_RUNNING		PI_NOTARGETS
1800Sstevel@tonic-gate  *	(Host targets)		: set IFF_FAILED
1810Sstevel@tonic-gate  *				: delete the target list on all phyints
1820Sstevel@tonic-gate  * ---------------------------------------------------------------------------
1830Sstevel@tonic-gate  */
1840Sstevel@tonic-gate 
1850Sstevel@tonic-gate struct probes_missed probes_missed;
1860Sstevel@tonic-gate 
1870Sstevel@tonic-gate /*
1880Sstevel@tonic-gate  * Compose and transmit an ICMP ECHO REQUEST packet.  The IP header
1890Sstevel@tonic-gate  * will be added on by the kernel.  The id field identifies this phyint.
1900Sstevel@tonic-gate  * and the sequence number is an increasing (modulo 2^^16) integer. The data
1910Sstevel@tonic-gate  * portion holds the time value when the packet is sent. On echo this is
1920Sstevel@tonic-gate  * extracted to compute the round-trip time. Three different types of
1930Sstevel@tonic-gate  * probe packets are used.
1940Sstevel@tonic-gate  *
1950Sstevel@tonic-gate  * PROBE_UNI: This type is used to do failure detection / failure recovery
1960Sstevel@tonic-gate  *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
1970Sstevel@tonic-gate  *	not less than the current CRTT. pii_probes[] stores data
1980Sstevel@tonic-gate  *	about these probes. These packets consume sequence number space.
1990Sstevel@tonic-gate  *
2008485SPeter.Memishian@Sun.COM  * PROBE_RTT: This type is used to make only rtt measurements. Normally these
2010Sstevel@tonic-gate  * 	are not used. Under heavy network load, the rtt may go up very high,
2020Sstevel@tonic-gate  *	due to a spike, or may appear to go high, due to extreme scheduling
2030Sstevel@tonic-gate  * 	delays. Once the network stress is removed, mpathd takes long time to
2040Sstevel@tonic-gate  *	recover, because the probe_interval is already high, and it takes
2050Sstevel@tonic-gate  *	a long time to send out sufficient number of probes to bring down the
2060Sstevel@tonic-gate  *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
2070Sstevel@tonic-gate  *	user_probe_interval ms. and will cause only rtt updates. These packets
2080Sstevel@tonic-gate  *	do not consume sequence number space nor is information about these
2090Sstevel@tonic-gate  *	packets stored in the pii_probes[]
2100Sstevel@tonic-gate  *
2110Sstevel@tonic-gate  * PROBE_MULTI: This type is only used to construct a list of targets, when
2120Sstevel@tonic-gate  *	no targets are known. The packet is multicast to the all hosts addr.
2130Sstevel@tonic-gate  */
2140Sstevel@tonic-gate static void
probe(struct phyint_instance * pii,uint_t probe_type,hrtime_t start_hrtime)2158485SPeter.Memishian@Sun.COM probe(struct phyint_instance *pii, uint_t probe_type, hrtime_t start_hrtime)
2160Sstevel@tonic-gate {
2178485SPeter.Memishian@Sun.COM 	hrtime_t sent_hrtime;
2188485SPeter.Memishian@Sun.COM 	struct timeval sent_tv;
2190Sstevel@tonic-gate 	struct pr_icmp probe_pkt;	/* Probe packet */
2208485SPeter.Memishian@Sun.COM 	struct sockaddr_storage targ;	/* target address */
2218485SPeter.Memishian@Sun.COM 	uint_t	targaddrlen;		/* targed address length */
2220Sstevel@tonic-gate 	int	pr_ndx;			/* probe index in pii->pii_probes[] */
22310377SGeorge.Shepherd@Sun.COM 	boolean_t sent = _B_FALSE;
22410377SGeorge.Shepherd@Sun.COM 	int	rval;
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate 	if (debug & D_TARGET) {
2278485SPeter.Memishian@Sun.COM 		logdebug("probe(%s %s %d %lld)\n", AF_STR(pii->pii_af),
2288485SPeter.Memishian@Sun.COM 		    pii->pii_name, probe_type, start_hrtime);
2290Sstevel@tonic-gate 	}
2300Sstevel@tonic-gate 
2310Sstevel@tonic-gate 	assert(pii->pii_probe_sock != -1);
2320Sstevel@tonic-gate 	assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI ||
2330Sstevel@tonic-gate 	    probe_type == PROBE_RTT);
2340Sstevel@tonic-gate 
2350Sstevel@tonic-gate 	probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ?
2360Sstevel@tonic-gate 	    ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST;
2370Sstevel@tonic-gate 	probe_pkt.pr_icmp_code = 0;
2380Sstevel@tonic-gate 	probe_pkt.pr_icmp_cksum = 0;
2390Sstevel@tonic-gate 	probe_pkt.pr_icmp_seq = htons(pii->pii_snxt);
2400Sstevel@tonic-gate 
2410Sstevel@tonic-gate 	/*
2420Sstevel@tonic-gate 	 * Since there is no need to do arithmetic on the icmpid,
2430Sstevel@tonic-gate 	 * (only equality check is done) pii_icmpid is stored in
2440Sstevel@tonic-gate 	 * network byte order at initialization itself.
2450Sstevel@tonic-gate 	 */
2460Sstevel@tonic-gate 	probe_pkt.pr_icmp_id = pii->pii_icmpid;
2478485SPeter.Memishian@Sun.COM 	probe_pkt.pr_icmp_timestamp = htonll(start_hrtime);
2480Sstevel@tonic-gate 	probe_pkt.pr_icmp_mtype = htonl(probe_type);
2490Sstevel@tonic-gate 
2500Sstevel@tonic-gate 	/*
2510Sstevel@tonic-gate 	 * If probe_type is PROBE_MULTI, this packet will be multicast to
2520Sstevel@tonic-gate 	 * the all hosts address. Otherwise it is unicast to the next target.
2530Sstevel@tonic-gate 	 */
2540Sstevel@tonic-gate 	assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) &&
2550Sstevel@tonic-gate 	    pii->pii_rtt_target_next != NULL));
2560Sstevel@tonic-gate 
2578485SPeter.Memishian@Sun.COM 	bzero(&targ, sizeof (targ));
2588485SPeter.Memishian@Sun.COM 	targ.ss_family = pii->pii_af;
2598485SPeter.Memishian@Sun.COM 
2600Sstevel@tonic-gate 	if (pii->pii_af == AF_INET6) {
2618485SPeter.Memishian@Sun.COM 		struct in6_addr *addr6;
2628485SPeter.Memishian@Sun.COM 
2638485SPeter.Memishian@Sun.COM 		addr6 = &((struct sockaddr_in6 *)&targ)->sin6_addr;
2648485SPeter.Memishian@Sun.COM 		targaddrlen = sizeof (struct sockaddr_in6);
2650Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
2668485SPeter.Memishian@Sun.COM 			*addr6 = all_nodes_mcast_v6;
2670Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
2688485SPeter.Memishian@Sun.COM 			*addr6 = pii->pii_target_next->tg_address;
2698485SPeter.Memishian@Sun.COM 		} else { /* type is PROBE_RTT */
2708485SPeter.Memishian@Sun.COM 			*addr6 = pii->pii_rtt_target_next->tg_address;
2710Sstevel@tonic-gate 		}
2720Sstevel@tonic-gate 	} else {
2738485SPeter.Memishian@Sun.COM 		struct in_addr *addr4;
2748485SPeter.Memishian@Sun.COM 
2758485SPeter.Memishian@Sun.COM 		addr4 = &((struct sockaddr_in *)&targ)->sin_addr;
2768485SPeter.Memishian@Sun.COM 		targaddrlen = sizeof (struct sockaddr_in);
2770Sstevel@tonic-gate 		if (probe_type == PROBE_MULTI) {
2788485SPeter.Memishian@Sun.COM 			*addr4 = all_nodes_mcast_v4;
2790Sstevel@tonic-gate 		} else if (probe_type == PROBE_UNI) {
2800Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
2818485SPeter.Memishian@Sun.COM 			    &pii->pii_target_next->tg_address, addr4);
2828485SPeter.Memishian@Sun.COM 		} else { /* type is PROBE_RTT */
2830Sstevel@tonic-gate 			IN6_V4MAPPED_TO_INADDR(
2848485SPeter.Memishian@Sun.COM 			    &pii->pii_rtt_target_next->tg_address, addr4);
2850Sstevel@tonic-gate 		}
2860Sstevel@tonic-gate 
2870Sstevel@tonic-gate 		/*
2880Sstevel@tonic-gate 		 * Compute the IPv4 icmp checksum. Does not cover the IP header.
2890Sstevel@tonic-gate 		 */
2900Sstevel@tonic-gate 		probe_pkt.pr_icmp_cksum =
2910Sstevel@tonic-gate 		    in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt));
2928485SPeter.Memishian@Sun.COM 	}
2938485SPeter.Memishian@Sun.COM 
2948485SPeter.Memishian@Sun.COM 	/*
2958485SPeter.Memishian@Sun.COM 	 * Use the current time as the time we sent.  Not atomic, but the best
2968485SPeter.Memishian@Sun.COM 	 * we can do from here.
2978485SPeter.Memishian@Sun.COM 	 */
2988485SPeter.Memishian@Sun.COM 	sent_hrtime = gethrtime();
2998485SPeter.Memishian@Sun.COM 	(void) gettimeofday(&sent_tv, NULL);
30010377SGeorge.Shepherd@Sun.COM 	rval = sendto(pii->pii_probe_sock, &probe_pkt, sizeof (probe_pkt), 0,
30110377SGeorge.Shepherd@Sun.COM 	    (struct sockaddr *)&targ, targaddrlen);
30210377SGeorge.Shepherd@Sun.COM 	/*
30310377SGeorge.Shepherd@Sun.COM 	 * If the send would block, this may either be transient or a hang in a
30410377SGeorge.Shepherd@Sun.COM 	 * lower layer. We pretend the probe was actually sent, the daemon will
30510377SGeorge.Shepherd@Sun.COM 	 * not see a reply to the probe and will fail the interface if normal
30610377SGeorge.Shepherd@Sun.COM 	 * failure detection criteria are met.
30710377SGeorge.Shepherd@Sun.COM 	 */
30810377SGeorge.Shepherd@Sun.COM 	if (rval == sizeof (probe_pkt) ||
30910377SGeorge.Shepherd@Sun.COM 	    (rval == -1 && errno == EWOULDBLOCK)) {
31010377SGeorge.Shepherd@Sun.COM 		sent = _B_TRUE;
31110377SGeorge.Shepherd@Sun.COM 	} else {
3128485SPeter.Memishian@Sun.COM 		logperror_pii(pii, "probe: probe sendto");
3130Sstevel@tonic-gate 	}
3140Sstevel@tonic-gate 
3150Sstevel@tonic-gate 	/*
3160Sstevel@tonic-gate 	 * If this is a PROBE_UNI probe packet being unicast to a target, then
3170Sstevel@tonic-gate 	 * update our tables. We will need this info in processing the probe
3180Sstevel@tonic-gate 	 * response. PROBE_MULTI and PROBE_RTT packets are not used for
3190Sstevel@tonic-gate 	 * the purpose of failure or recovery detection. PROBE_MULTI packets
3200Sstevel@tonic-gate 	 * are only used to construct a list of targets. PROBE_RTT packets are
3210Sstevel@tonic-gate 	 * used only for updating the rtt and not for failure detection.
3220Sstevel@tonic-gate 	 */
3230Sstevel@tonic-gate 	if (probe_type == PROBE_UNI && sent) {
3240Sstevel@tonic-gate 		pr_ndx = pii->pii_probe_next;
3250Sstevel@tonic-gate 		assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT);
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate 		/* Collect statistics, before we reuse the last slot. */
3280Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_LOST)
3290Sstevel@tonic-gate 			pii->pii_cum_stats.lost++;
3300Sstevel@tonic-gate 		else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED)
3310Sstevel@tonic-gate 			pii->pii_cum_stats.acked++;
3320Sstevel@tonic-gate 		pii->pii_cum_stats.sent++;
3330Sstevel@tonic-gate 
3348485SPeter.Memishian@Sun.COM 		pii->pii_probes[pr_ndx].pr_id = pii->pii_snxt;
3358485SPeter.Memishian@Sun.COM 		pii->pii_probes[pr_ndx].pr_tv_sent = sent_tv;
3368485SPeter.Memishian@Sun.COM 		pii->pii_probes[pr_ndx].pr_hrtime_sent = sent_hrtime;
3378485SPeter.Memishian@Sun.COM 		pii->pii_probes[pr_ndx].pr_hrtime_start = start_hrtime;
3380Sstevel@tonic-gate 		pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next;
3398485SPeter.Memishian@Sun.COM 		probe_chstate(&pii->pii_probes[pr_ndx], pii, PR_UNACKED);
3408485SPeter.Memishian@Sun.COM 
3410Sstevel@tonic-gate 		pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next);
3420Sstevel@tonic-gate 		pii->pii_target_next = target_next(pii->pii_target_next);
3430Sstevel@tonic-gate 		assert(pii->pii_target_next != NULL);
3440Sstevel@tonic-gate 		/*
3450Sstevel@tonic-gate 		 * If we have a single variable to denote the next target to
3460Sstevel@tonic-gate 		 * probe for both rtt probes and failure detection probes, we
3470Sstevel@tonic-gate 		 * could end up with a situation where the failure detection
3480Sstevel@tonic-gate 		 * probe targets become disjoint from the rtt probe targets.
3490Sstevel@tonic-gate 		 * Eg. if 2 targets and the actual fdt is double the user
3500Sstevel@tonic-gate 		 * specified fdt. So we have 2 variables. In this scheme
3510Sstevel@tonic-gate 		 * we also reset pii_rtt_target_next for every fdt probe,
3520Sstevel@tonic-gate 		 * though that may not be necessary.
3530Sstevel@tonic-gate 		 */
3540Sstevel@tonic-gate 		pii->pii_rtt_target_next = pii->pii_target_next;
3550Sstevel@tonic-gate 		pii->pii_snxt++;
3560Sstevel@tonic-gate 	} else if (probe_type == PROBE_RTT) {
3570Sstevel@tonic-gate 		pii->pii_rtt_target_next =
3580Sstevel@tonic-gate 		    target_next(pii->pii_rtt_target_next);
3590Sstevel@tonic-gate 		assert(pii->pii_rtt_target_next != NULL);
3600Sstevel@tonic-gate 	}
3610Sstevel@tonic-gate }
3620Sstevel@tonic-gate 
3630Sstevel@tonic-gate /*
3640Sstevel@tonic-gate  * Incoming IPv4 data from wire, is received here. Called from main.
3650Sstevel@tonic-gate  */
3660Sstevel@tonic-gate void
in_data(struct phyint_instance * pii)3670Sstevel@tonic-gate in_data(struct phyint_instance *pii)
3680Sstevel@tonic-gate {
3690Sstevel@tonic-gate 	struct	sockaddr_in 	from;
3700Sstevel@tonic-gate 	struct	in6_addr	fromaddr;
3718485SPeter.Memishian@Sun.COM 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
3728485SPeter.Memishian@Sun.COM 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
3730Sstevel@tonic-gate 	struct ip *ip;
3740Sstevel@tonic-gate 	int 	iphlen;
3750Sstevel@tonic-gate 	int 	len;
3760Sstevel@tonic-gate 	char 	abuf[INET_ADDRSTRLEN];
3778485SPeter.Memishian@Sun.COM 	struct msghdr msg;
3788485SPeter.Memishian@Sun.COM 	struct iovec iov;
3798485SPeter.Memishian@Sun.COM 	struct pr_icmp *reply;
3808485SPeter.Memishian@Sun.COM 	struct timeval *recv_tvp;
3810Sstevel@tonic-gate 
3820Sstevel@tonic-gate 	if (debug & D_PROBE) {
3830Sstevel@tonic-gate 		logdebug("in_data(%s %s)\n",
3840Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
3850Sstevel@tonic-gate 	}
3860Sstevel@tonic-gate 
3878485SPeter.Memishian@Sun.COM 	iov.iov_base = (char *)in_packet;
3888485SPeter.Memishian@Sun.COM 	iov.iov_len = sizeof (in_packet);
3898485SPeter.Memishian@Sun.COM 	msg.msg_iov = &iov;
3908485SPeter.Memishian@Sun.COM 	msg.msg_iovlen = 1;
3918485SPeter.Memishian@Sun.COM 	msg.msg_name = (struct sockaddr *)&from;
3928485SPeter.Memishian@Sun.COM 	msg.msg_namelen = sizeof (from);
3938485SPeter.Memishian@Sun.COM 	msg.msg_control = ancillary_data;
3948485SPeter.Memishian@Sun.COM 	msg.msg_controllen = sizeof (ancillary_data);
3958485SPeter.Memishian@Sun.COM 
3960Sstevel@tonic-gate 	/*
3970Sstevel@tonic-gate 	 * Poll has already told us that a message is waiting,
3980Sstevel@tonic-gate 	 * on this socket. Read it now. We should not block.
3990Sstevel@tonic-gate 	 */
4008485SPeter.Memishian@Sun.COM 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
4018485SPeter.Memishian@Sun.COM 		logperror_pii(pii, "in_data: recvmsg");
4020Sstevel@tonic-gate 		return;
4030Sstevel@tonic-gate 	}
4040Sstevel@tonic-gate 
4050Sstevel@tonic-gate 	/*
4068485SPeter.Memishian@Sun.COM 	 * If the datalink has indicated the link is down, don't go
4070Sstevel@tonic-gate 	 * any further.
4080Sstevel@tonic-gate 	 */
4090Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
4100Sstevel@tonic-gate 		return;
4110Sstevel@tonic-gate 
4120Sstevel@tonic-gate 	/* Get the printable address for error reporting */
4130Sstevel@tonic-gate 	(void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf));
4140Sstevel@tonic-gate 
4158485SPeter.Memishian@Sun.COM 	/* Ignore packets > 64k or control buffers that don't fit */
4168485SPeter.Memishian@Sun.COM 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
4178485SPeter.Memishian@Sun.COM 		if (debug & D_PKTBAD) {
4188485SPeter.Memishian@Sun.COM 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
4198485SPeter.Memishian@Sun.COM 			    msg.msg_flags, abuf);
4208485SPeter.Memishian@Sun.COM 		}
4218485SPeter.Memishian@Sun.COM 		return;
4228485SPeter.Memishian@Sun.COM 	}
4238485SPeter.Memishian@Sun.COM 
4240Sstevel@tonic-gate 	/* Make sure packet contains at least minimum ICMP header */
4250Sstevel@tonic-gate 	ip = (struct ip *)in_packet;
4260Sstevel@tonic-gate 	iphlen = ip->ip_hl << 2;
4270Sstevel@tonic-gate 	if (len < iphlen + ICMP_MINLEN) {
4280Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
4290Sstevel@tonic-gate 			logdebug("in_data: packet too short (%d bytes)"
4300Sstevel@tonic-gate 			    " from %s\n", len, abuf);
4310Sstevel@tonic-gate 		}
4320Sstevel@tonic-gate 		return;
4330Sstevel@tonic-gate 	}
4340Sstevel@tonic-gate 
4350Sstevel@tonic-gate 	/*
4360Sstevel@tonic-gate 	 * Subtract the IP hdr length, 'len' will be length of the probe
4370Sstevel@tonic-gate 	 * reply, starting from the icmp hdr.
4380Sstevel@tonic-gate 	 */
4390Sstevel@tonic-gate 	len -= iphlen;
4400Sstevel@tonic-gate 	/* LINTED */
4410Sstevel@tonic-gate 	reply = (struct pr_icmp *)((char *)in_packet + iphlen);
4420Sstevel@tonic-gate 
4430Sstevel@tonic-gate 	/* Probe replies are icmp echo replies. Ignore anything else */
4440Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP_ECHO_REPLY)
4450Sstevel@tonic-gate 		return;
4460Sstevel@tonic-gate 
4470Sstevel@tonic-gate 	/*
4480Sstevel@tonic-gate 	 * The icmp id should match what we sent, which is stored
4490Sstevel@tonic-gate 	 * in pi_icmpid. The icmp code for reply must be 0.
4500Sstevel@tonic-gate 	 * The reply content must be a struct pr_icmp
4510Sstevel@tonic-gate 	 */
4520Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
4530Sstevel@tonic-gate 		/* Not in response to our probe */
4540Sstevel@tonic-gate 		return;
4550Sstevel@tonic-gate 	}
4560Sstevel@tonic-gate 
4570Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
4580Sstevel@tonic-gate 		logtrace("probe reply code %d from %s on %s\n",
4590Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
4600Sstevel@tonic-gate 		return;
4610Sstevel@tonic-gate 	}
4620Sstevel@tonic-gate 
4630Sstevel@tonic-gate 	if (len < sizeof (struct pr_icmp)) {
4640Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
4650Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
4660Sstevel@tonic-gate 		return;
4670Sstevel@tonic-gate 	}
4680Sstevel@tonic-gate 
4698485SPeter.Memishian@Sun.COM 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
4708485SPeter.Memishian@Sun.COM 	if (recv_tvp == NULL) {
4718485SPeter.Memishian@Sun.COM 		logtrace("message without timestamp from %s on %s\n",
4728485SPeter.Memishian@Sun.COM 		    abuf, pii->pii_name);
4738485SPeter.Memishian@Sun.COM 		return;
4748485SPeter.Memishian@Sun.COM 	}
4758485SPeter.Memishian@Sun.COM 
4760Sstevel@tonic-gate 	IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr);
4770Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI))
4780Sstevel@tonic-gate 		/* Unicast probe reply */
4798485SPeter.Memishian@Sun.COM 		incoming_echo_reply(pii, reply, fromaddr, recv_tvp);
4800Sstevel@tonic-gate 	else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
4810Sstevel@tonic-gate 		/* Multicast reply */
4820Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, fromaddr);
4830Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
4840Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, fromaddr);
4850Sstevel@tonic-gate 	} else {
4860Sstevel@tonic-gate 		/* Probably not in response to our probe */
4870Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
4880Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
4890Sstevel@tonic-gate 		return;
4900Sstevel@tonic-gate 	}
4910Sstevel@tonic-gate }
4920Sstevel@tonic-gate 
4930Sstevel@tonic-gate /*
4940Sstevel@tonic-gate  * Incoming IPv6 data from wire is received here. Called from main.
4950Sstevel@tonic-gate  */
4960Sstevel@tonic-gate void
in6_data(struct phyint_instance * pii)4970Sstevel@tonic-gate in6_data(struct phyint_instance *pii)
4980Sstevel@tonic-gate {
4990Sstevel@tonic-gate 	struct sockaddr_in6 from;
5000Sstevel@tonic-gate 	static uint64_t in_packet[(IP_MAXPACKET + 1)/8];
5010Sstevel@tonic-gate 	static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8];
5020Sstevel@tonic-gate 	int len;
5030Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
5040Sstevel@tonic-gate 	struct msghdr msg;
5050Sstevel@tonic-gate 	struct iovec iov;
5068485SPeter.Memishian@Sun.COM 	void	*opt;
5070Sstevel@tonic-gate 	struct	pr_icmp *reply;
5088485SPeter.Memishian@Sun.COM 	struct	timeval *recv_tvp;
5090Sstevel@tonic-gate 
5100Sstevel@tonic-gate 	if (debug & D_PROBE) {
5110Sstevel@tonic-gate 		logdebug("in6_data(%s %s)\n",
5120Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
5130Sstevel@tonic-gate 	}
5140Sstevel@tonic-gate 
5150Sstevel@tonic-gate 	iov.iov_base = (char *)in_packet;
5160Sstevel@tonic-gate 	iov.iov_len = sizeof (in_packet);
5170Sstevel@tonic-gate 	msg.msg_iov = &iov;
5180Sstevel@tonic-gate 	msg.msg_iovlen = 1;
5190Sstevel@tonic-gate 	msg.msg_name = (struct sockaddr *)&from;
5200Sstevel@tonic-gate 	msg.msg_namelen = sizeof (from);
5210Sstevel@tonic-gate 	msg.msg_control = ancillary_data;
5220Sstevel@tonic-gate 	msg.msg_controllen = sizeof (ancillary_data);
5230Sstevel@tonic-gate 
5240Sstevel@tonic-gate 	if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) {
5258485SPeter.Memishian@Sun.COM 		logperror_pii(pii, "in6_data: recvmsg");
5260Sstevel@tonic-gate 		return;
5270Sstevel@tonic-gate 	}
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 	/*
5308485SPeter.Memishian@Sun.COM 	 * If the datalink has indicated that the link is down, don't go
5310Sstevel@tonic-gate 	 * any further.
5320Sstevel@tonic-gate 	 */
5330Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
5340Sstevel@tonic-gate 		return;
5350Sstevel@tonic-gate 
5360Sstevel@tonic-gate 	/* Get the printable address for error reporting */
5370Sstevel@tonic-gate 	(void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf));
5380Sstevel@tonic-gate 	if (len < ICMP_MINLEN) {
5390Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
5400Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
5410Sstevel@tonic-gate 			    msg.msg_flags, abuf);
5420Sstevel@tonic-gate 		}
5430Sstevel@tonic-gate 		return;
5440Sstevel@tonic-gate 	}
5450Sstevel@tonic-gate 	/* Ignore packets > 64k or control buffers that don't fit */
5460Sstevel@tonic-gate 	if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) {
5470Sstevel@tonic-gate 		if (debug & D_PKTBAD) {
5480Sstevel@tonic-gate 			logdebug("Truncated message: msg_flags 0x%x from %s\n",
5490Sstevel@tonic-gate 			    msg.msg_flags, abuf);
5500Sstevel@tonic-gate 		}
5510Sstevel@tonic-gate 		return;
5520Sstevel@tonic-gate 	}
5530Sstevel@tonic-gate 
5540Sstevel@tonic-gate 	reply = (struct pr_icmp *)in_packet;
5550Sstevel@tonic-gate 	if (reply->pr_icmp_type != ICMP6_ECHO_REPLY)
5560Sstevel@tonic-gate 		return;
5570Sstevel@tonic-gate 
5580Sstevel@tonic-gate 	if (reply->pr_icmp_id != pii->pii_icmpid) {
5590Sstevel@tonic-gate 		/* Not in response to our probe */
5600Sstevel@tonic-gate 		return;
5610Sstevel@tonic-gate 	}
5620Sstevel@tonic-gate 
5630Sstevel@tonic-gate 	/*
5640Sstevel@tonic-gate 	 * The kernel has already verified the the ICMP checksum.
5650Sstevel@tonic-gate 	 */
5660Sstevel@tonic-gate 	if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) {
5670Sstevel@tonic-gate 		logtrace("ICMPv6 echo reply source address not linklocal from "
5680Sstevel@tonic-gate 		    "%s on %s\n", abuf, pii->pii_name);
5690Sstevel@tonic-gate 		return;
5700Sstevel@tonic-gate 	}
5718485SPeter.Memishian@Sun.COM 	opt = find_ancillary(&msg, IPPROTO_IPV6, IPV6_RTHDR);
5720Sstevel@tonic-gate 	if (opt != NULL) {
5730Sstevel@tonic-gate 		/* Can't allow routing headers in probe replies  */
5740Sstevel@tonic-gate 		logtrace("message with routing header from %s on %s\n",
5750Sstevel@tonic-gate 		    abuf, pii->pii_name);
5760Sstevel@tonic-gate 		return;
5770Sstevel@tonic-gate 	}
5788485SPeter.Memishian@Sun.COM 
5790Sstevel@tonic-gate 	if (reply->pr_icmp_code != 0) {
5800Sstevel@tonic-gate 		logtrace("probe reply code: %d from %s on %s\n",
5810Sstevel@tonic-gate 		    reply->pr_icmp_code, abuf, pii->pii_name);
5820Sstevel@tonic-gate 		return;
5830Sstevel@tonic-gate 	}
5840Sstevel@tonic-gate 	if (len < (sizeof (struct pr_icmp))) {
5850Sstevel@tonic-gate 		logtrace("probe reply too short: %d bytes from %s on %s\n",
5860Sstevel@tonic-gate 		    len, abuf, pii->pii_name);
5870Sstevel@tonic-gate 		return;
5880Sstevel@tonic-gate 	}
5898485SPeter.Memishian@Sun.COM 
5908485SPeter.Memishian@Sun.COM 	recv_tvp = find_ancillary(&msg, SOL_SOCKET, SCM_TIMESTAMP);
5918485SPeter.Memishian@Sun.COM 	if (recv_tvp == NULL) {
5928485SPeter.Memishian@Sun.COM 		logtrace("message without timestamp from %s on %s\n",
5938485SPeter.Memishian@Sun.COM 		    abuf, pii->pii_name);
5948485SPeter.Memishian@Sun.COM 		return;
5958485SPeter.Memishian@Sun.COM 	}
5968485SPeter.Memishian@Sun.COM 
5970Sstevel@tonic-gate 	if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) {
5988485SPeter.Memishian@Sun.COM 		incoming_echo_reply(pii, reply, from.sin6_addr, recv_tvp);
5990Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) {
6000Sstevel@tonic-gate 		incoming_mcast_reply(pii, reply, from.sin6_addr);
6010Sstevel@tonic-gate 	} else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) {
6020Sstevel@tonic-gate 		incoming_rtt_reply(pii, reply, from.sin6_addr);
6030Sstevel@tonic-gate 	} else  {
6040Sstevel@tonic-gate 		/* Probably not in response to our probe */
6050Sstevel@tonic-gate 		logtrace("probe reply type: %d from %s on %s\n",
6060Sstevel@tonic-gate 		    reply->pr_icmp_mtype, abuf, pii->pii_name);
6070Sstevel@tonic-gate 	}
6080Sstevel@tonic-gate }
6090Sstevel@tonic-gate 
6100Sstevel@tonic-gate /*
6110Sstevel@tonic-gate  * Process the incoming rtt reply, in response to our rtt probe.
6120Sstevel@tonic-gate  * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't
6130Sstevel@tonic-gate  * have any stored information about the probe we sent. So we don't log
6140Sstevel@tonic-gate  * any errors if we receive bad replies.
6150Sstevel@tonic-gate  */
6160Sstevel@tonic-gate static void
incoming_rtt_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr)6170Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply,
6180Sstevel@tonic-gate     struct in6_addr fromaddr)
6190Sstevel@tonic-gate {
6208485SPeter.Memishian@Sun.COM 	int64_t	m;		/* rtt measurement in ns */
6210Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
6220Sstevel@tonic-gate 	struct	target	*target;
6230Sstevel@tonic-gate 	struct 	phyint_group *pg;
6240Sstevel@tonic-gate 
6250Sstevel@tonic-gate 	/* Get the printable address for error reporting */
6260Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
6270Sstevel@tonic-gate 
6280Sstevel@tonic-gate 	if (debug & D_PROBE) {
6290Sstevel@tonic-gate 		logdebug("incoming_rtt_reply: %s %s %s\n",
6300Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf);
6310Sstevel@tonic-gate 	}
6320Sstevel@tonic-gate 
6330Sstevel@tonic-gate 	/* Do we know this target ? */
6340Sstevel@tonic-gate 	target = target_lookup(pii, fromaddr);
6350Sstevel@tonic-gate 	if (target == NULL)
6360Sstevel@tonic-gate 		return;
6370Sstevel@tonic-gate 
6388485SPeter.Memishian@Sun.COM 	m = (int64_t)(gethrtime() - ntohll(reply->pr_icmp_timestamp));
6390Sstevel@tonic-gate 	/* Invalid rtt. It has wrapped around */
6400Sstevel@tonic-gate 	if (m < 0)
6410Sstevel@tonic-gate 		return;
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate 	/*
6440Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
6450Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
6460Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
6470Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
6480Sstevel@tonic-gate 	 */
6490Sstevel@tonic-gate 	pg = pii->pii_phyint->pi_group;
6500Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
6510Sstevel@tonic-gate 		return;
6520Sstevel@tonic-gate 
6530Sstevel@tonic-gate 	/*
6540Sstevel@tonic-gate 	 * Update rtt only if the new rtt is lower than the current rtt.
6550Sstevel@tonic-gate 	 * (specified by the 3rd parameter to pi_set_crtt).
6560Sstevel@tonic-gate 	 * If a spike has caused the current probe_interval to be >
6570Sstevel@tonic-gate 	 * user_probe_interval, then this mechanism is used to bring down
6580Sstevel@tonic-gate 	 * the rtt rapidly once the network stress is removed.
6590Sstevel@tonic-gate 	 * If the new rtt is higher than the current rtt, we don't want to
6600Sstevel@tonic-gate 	 * update the rtt. We are having more than 1 outstanding probe and
6610Sstevel@tonic-gate 	 * the increase in rtt we are seeing is being unnecessarily weighted
6620Sstevel@tonic-gate 	 * many times. The regular rtt update will be handled by
6630Sstevel@tonic-gate 	 * incoming_echo_reply() and will take care of any rtt increase.
6640Sstevel@tonic-gate 	 */
6650Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_FALSE);
6660Sstevel@tonic-gate 	if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
6670Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
6680Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
6690Sstevel@tonic-gate 		/*
6700Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER,
6710Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
6720Sstevel@tonic-gate 		 * meet whatever the user specified.
6730Sstevel@tonic-gate 		 */
6740Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
6750Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
6760Sstevel@tonic-gate 			    user_failure_detection_time);
6770Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
6780Sstevel@tonic-gate 			if (pii->pii_phyint->pi_group != phyint_anongroup) {
6790Sstevel@tonic-gate 				logerr("Improved failure detection time %d ms "
6800Sstevel@tonic-gate 				    "on (%s %s) for group \"%s\"\n",
6810Sstevel@tonic-gate 				    pg->pg_fdt, AF_STR(pii->pii_af),
6820Sstevel@tonic-gate 				    pii->pii_name,
6830Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_name);
6840Sstevel@tonic-gate 			}
6850Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
6860Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
6870Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
6880Sstevel@tonic-gate 				/*
6890Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
6900Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
6910Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
6920Sstevel@tonic-gate 				 * will be in sync henceforth.
6930Sstevel@tonic-gate 				 */
6940Sstevel@tonic-gate 				reset_snxt_basetimes();
6950Sstevel@tonic-gate 			}
6960Sstevel@tonic-gate 		}
6970Sstevel@tonic-gate 	}
6980Sstevel@tonic-gate }
6990Sstevel@tonic-gate 
7000Sstevel@tonic-gate /*
7010Sstevel@tonic-gate  * Process the incoming echo reply, in response to our unicast probe.
7020Sstevel@tonic-gate  * Common for both IPv4 and IPv6
7030Sstevel@tonic-gate  */
7040Sstevel@tonic-gate static void
incoming_echo_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr,struct timeval * recv_tvp)7050Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply,
7068485SPeter.Memishian@Sun.COM     struct in6_addr fromaddr, struct timeval *recv_tvp)
7070Sstevel@tonic-gate {
7088485SPeter.Memishian@Sun.COM 	int64_t	m;		/* rtt measurement in ns */
7098485SPeter.Memishian@Sun.COM 	hrtime_t cur_hrtime;	/* in ns from some arbitrary point */
7100Sstevel@tonic-gate 	char	abuf[INET6_ADDRSTRLEN];
7110Sstevel@tonic-gate 	int	pr_ndx;
7120Sstevel@tonic-gate 	struct	target	*target;
7130Sstevel@tonic-gate 	boolean_t exception;
7148485SPeter.Memishian@Sun.COM 	uint64_t pr_icmp_timestamp;
7150Sstevel@tonic-gate 	uint16_t pr_icmp_seq;
7168485SPeter.Memishian@Sun.COM 	struct	probe_stats *pr_statp;
7170Sstevel@tonic-gate 	struct 	phyint_group *pg = pii->pii_phyint->pi_group;
7180Sstevel@tonic-gate 
7190Sstevel@tonic-gate 	/* Get the printable address for error reporting */
7200Sstevel@tonic-gate 	(void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf));
7210Sstevel@tonic-gate 
7220Sstevel@tonic-gate 	if (debug & D_PROBE) {
7238485SPeter.Memishian@Sun.COM 		logdebug("incoming_echo_reply: %s %s %s seq %u recv_tvp %lld\n",
7240Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name, abuf,
7258485SPeter.Memishian@Sun.COM 		    ntohs(reply->pr_icmp_seq), tv2ns(recv_tvp));
7260Sstevel@tonic-gate 	}
7270Sstevel@tonic-gate 
7288485SPeter.Memishian@Sun.COM 	pr_icmp_timestamp = ntohll(reply->pr_icmp_timestamp);
7298485SPeter.Memishian@Sun.COM 	pr_icmp_seq = ntohs(reply->pr_icmp_seq);
7300Sstevel@tonic-gate 
7310Sstevel@tonic-gate 	/* Reject out of window probe replies */
7320Sstevel@tonic-gate 	if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) ||
7330Sstevel@tonic-gate 	    SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) {
7340Sstevel@tonic-gate 		logtrace("out of window probe seq %u snxt %u on %s from %s\n",
7350Sstevel@tonic-gate 		    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7360Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
7370Sstevel@tonic-gate 		return;
7380Sstevel@tonic-gate 	}
7398485SPeter.Memishian@Sun.COM 
7408485SPeter.Memishian@Sun.COM 	cur_hrtime = gethrtime();
7418485SPeter.Memishian@Sun.COM 	m = (int64_t)(cur_hrtime - pr_icmp_timestamp);
7420Sstevel@tonic-gate 	if (m < 0) {
7430Sstevel@tonic-gate 		/*
7440Sstevel@tonic-gate 		 * This is a ridiculously high value of rtt. rtt has wrapped
7450Sstevel@tonic-gate 		 * around. Log a message, and ignore the rtt.
7460Sstevel@tonic-gate 		 */
7478485SPeter.Memishian@Sun.COM 		logerr("incoming_echo_reply: rtt wraparound cur_hrtime %lld "
7488485SPeter.Memishian@Sun.COM 		    "reply timestamp %lld\n", cur_hrtime, pr_icmp_timestamp);
7490Sstevel@tonic-gate 	}
7500Sstevel@tonic-gate 
7510Sstevel@tonic-gate 	/*
7520Sstevel@tonic-gate 	 * Get the probe index pr_ndx corresponding to the received icmp seq.
7530Sstevel@tonic-gate 	 * number in our pii->pii_probes[] array. The icmp sequence number
7540Sstevel@tonic-gate 	 * pii_snxt corresponds to the probe index pii->pii_probe_next
7550Sstevel@tonic-gate 	 */
7560Sstevel@tonic-gate 	pr_ndx = MOD_SUB(pii->pii_probe_next,
7570Sstevel@tonic-gate 	    (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT);
7580Sstevel@tonic-gate 
7590Sstevel@tonic-gate 	assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status));
7600Sstevel@tonic-gate 
7610Sstevel@tonic-gate 	target = pii->pii_probes[pr_ndx].pr_target;
7620Sstevel@tonic-gate 
7630Sstevel@tonic-gate 	/*
7640Sstevel@tonic-gate 	 * Perform sanity checks, whether this probe reply that we
7650Sstevel@tonic-gate 	 * have received is genuine
7660Sstevel@tonic-gate 	 */
7670Sstevel@tonic-gate 	if (target != NULL) {
7680Sstevel@tonic-gate 		/*
7690Sstevel@tonic-gate 		 * Compare the src. addr of the received ICMP or ICMPv6
7700Sstevel@tonic-gate 		 * probe reply with the target address in our tables.
7710Sstevel@tonic-gate 		 */
7720Sstevel@tonic-gate 		if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) {
7730Sstevel@tonic-gate 			/*
7740Sstevel@tonic-gate 			 * We don't have any record of having sent a probe to
7750Sstevel@tonic-gate 			 * this target. This is a fake probe reply. Log an error
7760Sstevel@tonic-gate 			 */
7770Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
7780Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
7790Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
7800Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7810Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
7820Sstevel@tonic-gate 			return;
7830Sstevel@tonic-gate 		} else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
7840Sstevel@tonic-gate 			/*
7850Sstevel@tonic-gate 			 * The address matches, but our tables indicate that
7860Sstevel@tonic-gate 			 * this probe reply has been acked already. So this
7870Sstevel@tonic-gate 			 * is a duplicate probe reply. Log an error
7880Sstevel@tonic-gate 			 */
7890Sstevel@tonic-gate 			logtrace("probe status %d Duplicate probe reply seq %u "
7900Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
7910Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
7920Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
7930Sstevel@tonic-gate 			pii->pii_cum_stats.unknown++;
7940Sstevel@tonic-gate 			return;
7950Sstevel@tonic-gate 		}
7960Sstevel@tonic-gate 	} else {
7970Sstevel@tonic-gate 		/*
7980Sstevel@tonic-gate 		 * Target must not be NULL in the PR_UNACKED state
7990Sstevel@tonic-gate 		 */
8000Sstevel@tonic-gate 		assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED);
8010Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) {
8020Sstevel@tonic-gate 			/*
8030Sstevel@tonic-gate 			 * The probe stats slot is unused. So we didn't
8040Sstevel@tonic-gate 			 * send out any probe to this target. This is a fake.
8050Sstevel@tonic-gate 			 * Log an error.
8060Sstevel@tonic-gate 			 */
8070Sstevel@tonic-gate 			logtrace("probe status %d Fake probe reply seq %u "
8080Sstevel@tonic-gate 			    "snxt %u on %s from %s\n",
8090Sstevel@tonic-gate 			    pii->pii_probes[pr_ndx].pr_status,
8100Sstevel@tonic-gate 			    pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf);
8110Sstevel@tonic-gate 		}
8120Sstevel@tonic-gate 		pii->pii_cum_stats.unknown++;
8130Sstevel@tonic-gate 		return;
8140Sstevel@tonic-gate 	}
8150Sstevel@tonic-gate 
8160Sstevel@tonic-gate 	/*
8170Sstevel@tonic-gate 	 * If the rtt does not appear to be right, don't update the
8180Sstevel@tonic-gate 	 * rtt stats. This can happen if the system dropped into the
8190Sstevel@tonic-gate 	 * debugger, or the system was hung or too busy for a
8200Sstevel@tonic-gate 	 * substantial time that we didn't get a chance to run.
8210Sstevel@tonic-gate 	 */
8228485SPeter.Memishian@Sun.COM 	if ((m < 0) || (ns2ms(m) > PROBE_STATS_COUNT * pg->pg_probeint)) {
8230Sstevel@tonic-gate 		/*
8248485SPeter.Memishian@Sun.COM 		 * If the probe corresponding to this received response
8258485SPeter.Memishian@Sun.COM 		 * was truly sent 'm' ns. ago, then this response must
8260Sstevel@tonic-gate 		 * have been rejected by the sequence number checks. The
8270Sstevel@tonic-gate 		 * fact that it has passed the sequence number checks
8280Sstevel@tonic-gate 		 * means that the measured rtt is wrong. We were probably
8290Sstevel@tonic-gate 		 * scheduled long after the packet was received.
8300Sstevel@tonic-gate 		 */
8310Sstevel@tonic-gate 		goto out;
8320Sstevel@tonic-gate 	}
8330Sstevel@tonic-gate 
8340Sstevel@tonic-gate 	/*
8350Sstevel@tonic-gate 	 * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses
8360Sstevel@tonic-gate 	 * The initial few responses after the interface is repaired may
8370Sstevel@tonic-gate 	 * contain high rtt's because they could have been queued up waiting
8380Sstevel@tonic-gate 	 * for ARP/NDP resolution on a failed interface.
8390Sstevel@tonic-gate 	 */
8400Sstevel@tonic-gate 	if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg))
8410Sstevel@tonic-gate 		goto out;
8420Sstevel@tonic-gate 
8430Sstevel@tonic-gate 	/*
8440Sstevel@tonic-gate 	 * Don't update the Conservative Round Trip Time estimate for this
8450Sstevel@tonic-gate 	 * (phint, target) pair if this is the not the highest ack seq seen
8460Sstevel@tonic-gate 	 * thus far on this target.
8470Sstevel@tonic-gate 	 */
8480Sstevel@tonic-gate 	if (!highest_ack_tg(pr_icmp_seq, target))
8490Sstevel@tonic-gate 		goto out;
8500Sstevel@tonic-gate 
8510Sstevel@tonic-gate 	/*
8520Sstevel@tonic-gate 	 * Always update the rtt. This is a failure detection probe
8530Sstevel@tonic-gate 	 * and we want to measure both increase / decrease in rtt.
8540Sstevel@tonic-gate 	 */
8550Sstevel@tonic-gate 	pi_set_crtt(target, m, _B_TRUE);
8560Sstevel@tonic-gate 
8570Sstevel@tonic-gate 	/*
8580Sstevel@tonic-gate 	 * If the crtt exceeds the average time between probes,
8590Sstevel@tonic-gate 	 * investigate if this slow target is an exception. If so we
8600Sstevel@tonic-gate 	 * can avoid this target and still meet the failure detection
8610Sstevel@tonic-gate 	 * time. Otherwise we can't meet the failure detection time.
8620Sstevel@tonic-gate 	 */
8630Sstevel@tonic-gate 	if (target->tg_crtt > pg->pg_probeint) {
8640Sstevel@tonic-gate 		exception = check_exception_target(pii, target);
8650Sstevel@tonic-gate 		if (exception) {
8660Sstevel@tonic-gate 			/*
8670Sstevel@tonic-gate 			 * This target is exceptionally slow. Don't use it
8680Sstevel@tonic-gate 			 * for future probes. check_exception_target() has
8690Sstevel@tonic-gate 			 * made sure that we have at least MIN_PROBE_TARGETS
8700Sstevel@tonic-gate 			 * other active targets
8710Sstevel@tonic-gate 			 */
8720Sstevel@tonic-gate 			if (pii->pii_targets_are_routers) {
8730Sstevel@tonic-gate 				/*
8740Sstevel@tonic-gate 				 * This is a slow router, mark it as slow
8750Sstevel@tonic-gate 				 * and don't use it for further probes. We
8760Sstevel@tonic-gate 				 * don't delete it, since it will be populated
8770Sstevel@tonic-gate 				 * again when we do a router scan. Hence we
8780Sstevel@tonic-gate 				 * need to maintain extra state (unlike the
8790Sstevel@tonic-gate 				 * host case below).  Mark it as TG_SLOW.
8800Sstevel@tonic-gate 				 */
8810Sstevel@tonic-gate 				if (target->tg_status == TG_ACTIVE)
8820Sstevel@tonic-gate 					pii->pii_ntargets--;
8830Sstevel@tonic-gate 				target->tg_status = TG_SLOW;
8840Sstevel@tonic-gate 				target->tg_latime = gethrtime();
8850Sstevel@tonic-gate 				target->tg_rtt_sa = -1;
8860Sstevel@tonic-gate 				target->tg_crtt = 0;
8870Sstevel@tonic-gate 				target->tg_rtt_sd = 0;
8880Sstevel@tonic-gate 				if (pii->pii_target_next == target) {
8890Sstevel@tonic-gate 					pii->pii_target_next =
8900Sstevel@tonic-gate 					    target_next(target);
8910Sstevel@tonic-gate 				}
8920Sstevel@tonic-gate 			} else {
8930Sstevel@tonic-gate 				/*
8940Sstevel@tonic-gate 				 * the slow target is not a router, we can
8950Sstevel@tonic-gate 				 * just delete it. Send an icmp multicast and
8960Sstevel@tonic-gate 				 * pick the fastest responder that is not
8970Sstevel@tonic-gate 				 * already an active target. target_delete()
8980Sstevel@tonic-gate 				 * adjusts pii->pii_target_next
8990Sstevel@tonic-gate 				 */
9000Sstevel@tonic-gate 				target_delete(target);
9018485SPeter.Memishian@Sun.COM 				probe(pii, PROBE_MULTI, cur_hrtime);
9020Sstevel@tonic-gate 			}
9030Sstevel@tonic-gate 		} else {
9040Sstevel@tonic-gate 			/*
9050Sstevel@tonic-gate 			 * We can't meet the failure detection time.
9060Sstevel@tonic-gate 			 * Log a message, and update the detection time to
9070Sstevel@tonic-gate 			 * whatever we can achieve.
9080Sstevel@tonic-gate 			 */
9090Sstevel@tonic-gate 			pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE;
9100Sstevel@tonic-gate 			pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2);
9110Sstevel@tonic-gate 			last_fdt_bumpup_time = gethrtime();
9120Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
913*10649SPeter.Memishian@Sun.COM 				logtrace("Cannot meet requested failure"
914*10649SPeter.Memishian@Sun.COM 				    " detection time of %d ms on (%s %s) new"
915*10649SPeter.Memishian@Sun.COM 				    " failure detection time for group \"%s\""
916*10649SPeter.Memishian@Sun.COM 				    " is %d ms\n", user_failure_detection_time,
9170Sstevel@tonic-gate 				    AF_STR(pii->pii_af), pii->pii_name,
9180Sstevel@tonic-gate 				    pg->pg_name, pg->pg_fdt);
9190Sstevel@tonic-gate 			}
9200Sstevel@tonic-gate 		}
9210Sstevel@tonic-gate 	} else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) &&
9220Sstevel@tonic-gate 	    (user_failure_detection_time < pg->pg_fdt) &&
9230Sstevel@tonic-gate 	    (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) {
9240Sstevel@tonic-gate 		/*
9250Sstevel@tonic-gate 		 * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER
9260Sstevel@tonic-gate 		 * investigate if we can improve the failure detection time to
9270Sstevel@tonic-gate 		 * meet whatever the user specified.
9280Sstevel@tonic-gate 		 */
9290Sstevel@tonic-gate 		if (check_pg_crtt_improved(pg)) {
9300Sstevel@tonic-gate 			pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE,
9310Sstevel@tonic-gate 			    user_failure_detection_time);
9320Sstevel@tonic-gate 			pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2);
9330Sstevel@tonic-gate 			if (pg != phyint_anongroup) {
934*10649SPeter.Memishian@Sun.COM 				logtrace("Improved failure detection time %d ms"
935*10649SPeter.Memishian@Sun.COM 				    " on (%s %s) for group \"%s\"\n",
936*10649SPeter.Memishian@Sun.COM 				    pg->pg_fdt, AF_STR(pii->pii_af),
937*10649SPeter.Memishian@Sun.COM 				    pii->pii_name, pg->pg_name);
9380Sstevel@tonic-gate 			}
9390Sstevel@tonic-gate 			if (user_failure_detection_time == pg->pg_fdt) {
9400Sstevel@tonic-gate 				/* Avoid any truncation or rounding errors */
9410Sstevel@tonic-gate 				pg->pg_probeint = user_probe_interval;
9420Sstevel@tonic-gate 				/*
9430Sstevel@tonic-gate 				 * No more rtt probes will be sent. The actual
9440Sstevel@tonic-gate 				 * fdt has dropped to the user specified value.
9450Sstevel@tonic-gate 				 * pii_fd_snxt_basetime and pii_snxt_basetime
9460Sstevel@tonic-gate 				 * will be in sync henceforth.
9470Sstevel@tonic-gate 				 */
9480Sstevel@tonic-gate 				reset_snxt_basetimes();
9490Sstevel@tonic-gate 			}
9500Sstevel@tonic-gate 		}
9510Sstevel@tonic-gate 	}
9520Sstevel@tonic-gate out:
9538485SPeter.Memishian@Sun.COM 	pr_statp = &pii->pii_probes[pr_ndx];
9548485SPeter.Memishian@Sun.COM 	pr_statp->pr_hrtime_ackproc = cur_hrtime;
9558485SPeter.Memishian@Sun.COM 	pr_statp->pr_hrtime_ackrecv = pr_statp->pr_hrtime_sent +
9568485SPeter.Memishian@Sun.COM 	    (tv2ns(recv_tvp) - tv2ns(&pr_statp->pr_tv_sent));
9578485SPeter.Memishian@Sun.COM 
9588485SPeter.Memishian@Sun.COM 	probe_chstate(pr_statp, pii, PR_ACKED);
9590Sstevel@tonic-gate 
9600Sstevel@tonic-gate 	/*
9610Sstevel@tonic-gate 	 * Update pii->pii_rack, i.e. the sequence number of the last received
9620Sstevel@tonic-gate 	 * probe response, based on the echo reply we have received now, if
9630Sstevel@tonic-gate 	 * either of the following conditions are satisfied.
9640Sstevel@tonic-gate 	 * a. pii_rack is outside the current receive window of
9650Sstevel@tonic-gate 	 *    [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt).
9660Sstevel@tonic-gate 	 *    This means we have not received probe responses for a
9670Sstevel@tonic-gate 	 *    long time, and the sequence number has wrapped around.
9680Sstevel@tonic-gate 	 * b. pii_rack is within the current receive window and this echo
9690Sstevel@tonic-gate 	 *    reply corresponds to the highest sequence number we have seen
9700Sstevel@tonic-gate 	 *    so far.
9710Sstevel@tonic-gate 	 */
9720Sstevel@tonic-gate 	if (SEQ_GE(pii->pii_rack, pii->pii_snxt) ||
9730Sstevel@tonic-gate 	    SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) ||
9740Sstevel@tonic-gate 	    SEQ_GT(pr_icmp_seq, pii->pii_rack)) {
9750Sstevel@tonic-gate 		pii->pii_rack = pr_icmp_seq;
9760Sstevel@tonic-gate 	}
9770Sstevel@tonic-gate }
9780Sstevel@tonic-gate 
9790Sstevel@tonic-gate /*
9800Sstevel@tonic-gate  * Returns true if seq is the highest unacknowledged seq for target tg
9810Sstevel@tonic-gate  * else returns false
9820Sstevel@tonic-gate  */
9830Sstevel@tonic-gate static boolean_t
highest_ack_tg(uint16_t seq,struct target * tg)9840Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg)
9850Sstevel@tonic-gate {
9860Sstevel@tonic-gate 	struct phyint_instance *pii;
9870Sstevel@tonic-gate 	int	 pr_ndx;
9880Sstevel@tonic-gate 	uint16_t pr_seq;
9890Sstevel@tonic-gate 
9900Sstevel@tonic-gate 	pii = tg->tg_phyint_inst;
9910Sstevel@tonic-gate 
9920Sstevel@tonic-gate 	/*
9930Sstevel@tonic-gate 	 * Get the seq number of the most recent probe sent so far,
9940Sstevel@tonic-gate 	 * and also get the corresponding probe index in the probe stats
9950Sstevel@tonic-gate 	 * array.
9960Sstevel@tonic-gate 	 */
9970Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
9980Sstevel@tonic-gate 	pr_seq = pii->pii_snxt;
9990Sstevel@tonic-gate 	pr_seq--;
10000Sstevel@tonic-gate 
10010Sstevel@tonic-gate 	/*
10020Sstevel@tonic-gate 	 * Start from the most recent probe and walk back, trying to find
10030Sstevel@tonic-gate 	 * an acked probe corresponding to target tg.
10040Sstevel@tonic-gate 	 */
10050Sstevel@tonic-gate 	for (; pr_ndx != pii->pii_probe_next;
10060Sstevel@tonic-gate 	    pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) {
10070Sstevel@tonic-gate 		if (pii->pii_probes[pr_ndx].pr_target == tg &&
10080Sstevel@tonic-gate 		    pii->pii_probes[pr_ndx].pr_status == PR_ACKED) {
10090Sstevel@tonic-gate 			if (SEQ_GT(pr_seq, seq))
10100Sstevel@tonic-gate 				return (_B_FALSE);
10110Sstevel@tonic-gate 		}
10120Sstevel@tonic-gate 	}
10130Sstevel@tonic-gate 	return (_B_TRUE);
10140Sstevel@tonic-gate }
10150Sstevel@tonic-gate 
10160Sstevel@tonic-gate /*
10170Sstevel@tonic-gate  * Check whether the crtt for the group has improved by a factor of
10180Sstevel@tonic-gate  * LOWER_FDT_TRIGGER.  Small crtt improvements are ignored to avoid failure
10190Sstevel@tonic-gate  * detection time flapping in the face of small crtt changes.
10200Sstevel@tonic-gate  */
10210Sstevel@tonic-gate static boolean_t
check_pg_crtt_improved(struct phyint_group * pg)10220Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg)
10230Sstevel@tonic-gate {
10240Sstevel@tonic-gate 	struct	phyint *pi;
10250Sstevel@tonic-gate 
10260Sstevel@tonic-gate 	if (debug & D_PROBE)
10270Sstevel@tonic-gate 		logdebug("check_pg_crtt_improved()\n");
10280Sstevel@tonic-gate 
10290Sstevel@tonic-gate 	/*
10300Sstevel@tonic-gate 	 * The crtt for the group is only improved if each phyint_instance
10310Sstevel@tonic-gate 	 * for both ipv4 and ipv6 is improved.
10320Sstevel@tonic-gate 	 */
10330Sstevel@tonic-gate 	for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
10340Sstevel@tonic-gate 		if (!check_pii_crtt_improved(pi->pi_v4) ||
10350Sstevel@tonic-gate 		    !check_pii_crtt_improved(pi->pi_v6))
10360Sstevel@tonic-gate 			return (_B_FALSE);
10370Sstevel@tonic-gate 	}
10380Sstevel@tonic-gate 
10390Sstevel@tonic-gate 	return (_B_TRUE);
10400Sstevel@tonic-gate }
10410Sstevel@tonic-gate 
10420Sstevel@tonic-gate /*
10430Sstevel@tonic-gate  * Check whether the crtt has improved substantially on this phyint_instance.
10440Sstevel@tonic-gate  * Returns _B_TRUE if there's no crtt information available, because pii
10450Sstevel@tonic-gate  * is NULL or the phyint_instance is not capable of probing.
10460Sstevel@tonic-gate  */
10470Sstevel@tonic-gate boolean_t
check_pii_crtt_improved(struct phyint_instance * pii)10480Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) {
10490Sstevel@tonic-gate 	struct 	target *tg;
10500Sstevel@tonic-gate 
10510Sstevel@tonic-gate 	if (pii == NULL)
10520Sstevel@tonic-gate 		return (_B_TRUE);
10530Sstevel@tonic-gate 
10540Sstevel@tonic-gate 	if (!PROBE_CAPABLE(pii) ||
10550Sstevel@tonic-gate 	    pii->pii_phyint->pi_state == PI_FAILED)
10560Sstevel@tonic-gate 		return (_B_TRUE);
10570Sstevel@tonic-gate 
10580Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
10590Sstevel@tonic-gate 		if (tg->tg_status != TG_ACTIVE)
10600Sstevel@tonic-gate 			continue;
10610Sstevel@tonic-gate 		if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint /
10620Sstevel@tonic-gate 		    LOWER_FDT_TRIGGER)) {
10630Sstevel@tonic-gate 			return (_B_FALSE);
10640Sstevel@tonic-gate 		}
10650Sstevel@tonic-gate 	}
10660Sstevel@tonic-gate 
10670Sstevel@tonic-gate 	return (_B_TRUE);
10680Sstevel@tonic-gate }
10690Sstevel@tonic-gate 
10700Sstevel@tonic-gate /*
10710Sstevel@tonic-gate  * This target responds very slowly to probes. The target's crtt exceeds
10720Sstevel@tonic-gate  * the probe interval of its group. Compare against other targets
10730Sstevel@tonic-gate  * and determine if this target is an exception, if so return true, else false
10740Sstevel@tonic-gate  */
10750Sstevel@tonic-gate static boolean_t
check_exception_target(struct phyint_instance * pii,struct target * target)10760Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target)
10770Sstevel@tonic-gate {
10780Sstevel@tonic-gate 	struct	target *tg;
10790Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
10800Sstevel@tonic-gate 
10810Sstevel@tonic-gate 	if (debug & D_PROBE) {
10820Sstevel@tonic-gate 		logdebug("check_exception_target(%s %s target %s)\n",
10830Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
10840Sstevel@tonic-gate 		    pr_addr(pii->pii_af, target->tg_address,
10854929Srk129064 		    abuf, sizeof (abuf)));
10860Sstevel@tonic-gate 	}
10870Sstevel@tonic-gate 
10880Sstevel@tonic-gate 	/*
10890Sstevel@tonic-gate 	 * We should have at least MIN_PROBE_TARGETS + 1 good targets now,
10900Sstevel@tonic-gate 	 * to make a good judgement. Otherwise don't drop this target.
10910Sstevel@tonic-gate 	 */
10920Sstevel@tonic-gate 	if (pii->pii_ntargets <  MIN_PROBE_TARGETS + 1)
10930Sstevel@tonic-gate 		return (_B_FALSE);
10940Sstevel@tonic-gate 
10950Sstevel@tonic-gate 	/*
10960Sstevel@tonic-gate 	 * Determine whether only this particular target is slow.
10970Sstevel@tonic-gate 	 * We know that this target's crtt exceeds the group's probe interval.
10980Sstevel@tonic-gate 	 * If all other active targets have a
10990Sstevel@tonic-gate 	 * crtt < (this group's probe interval) / EXCEPTION_FACTOR,
11000Sstevel@tonic-gate 	 * then this target is considered slow.
11010Sstevel@tonic-gate 	 */
11020Sstevel@tonic-gate 	for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
11030Sstevel@tonic-gate 		if (tg != target && tg->tg_status == TG_ACTIVE) {
11040Sstevel@tonic-gate 			if (tg->tg_crtt >
11050Sstevel@tonic-gate 			    pii->pii_phyint->pi_group->pg_probeint /
11060Sstevel@tonic-gate 			    EXCEPTION_FACTOR) {
11070Sstevel@tonic-gate 				return (_B_FALSE);
11080Sstevel@tonic-gate 			}
11090Sstevel@tonic-gate 		}
11100Sstevel@tonic-gate 	}
11110Sstevel@tonic-gate 
11120Sstevel@tonic-gate 	return (_B_TRUE);
11130Sstevel@tonic-gate }
11140Sstevel@tonic-gate 
11150Sstevel@tonic-gate /*
11160Sstevel@tonic-gate  * Update the target list. The icmp all hosts multicast has given us
11170Sstevel@tonic-gate  * some host to which we can send probes. If we already have sufficient
11180Sstevel@tonic-gate  * targets, discard it.
11190Sstevel@tonic-gate  */
11200Sstevel@tonic-gate static void
incoming_mcast_reply(struct phyint_instance * pii,struct pr_icmp * reply,struct in6_addr fromaddr)11210Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply,
11220Sstevel@tonic-gate     struct in6_addr fromaddr)
11230Sstevel@tonic-gate /* ARGSUSED */
11240Sstevel@tonic-gate {
11250Sstevel@tonic-gate 	int af;
11260Sstevel@tonic-gate 	char abuf[INET6_ADDRSTRLEN];
11270Sstevel@tonic-gate 	struct phyint *pi;
11280Sstevel@tonic-gate 
11290Sstevel@tonic-gate 	if (debug & D_PROBE) {
11300Sstevel@tonic-gate 		logdebug("incoming_mcast_reply(%s %s %s)\n",
11310Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name,
11320Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)));
11330Sstevel@tonic-gate 	}
11340Sstevel@tonic-gate 
11350Sstevel@tonic-gate 	/*
11360Sstevel@tonic-gate 	 * Using host targets is a fallback mechanism. If we have
11370Sstevel@tonic-gate 	 * found a router, don't add this host target. If we already
11380Sstevel@tonic-gate 	 * know MAX_PROBE_TARGETS, don't add another target.
11390Sstevel@tonic-gate 	 */
11400Sstevel@tonic-gate 	assert(pii->pii_ntargets <= MAX_PROBE_TARGETS);
11410Sstevel@tonic-gate 	if (pii->pii_targets != NULL) {
11420Sstevel@tonic-gate 		if (pii->pii_targets_are_routers ||
11430Sstevel@tonic-gate 		    (pii->pii_ntargets == MAX_PROBE_TARGETS)) {
11440Sstevel@tonic-gate 			return;
11450Sstevel@tonic-gate 		}
11460Sstevel@tonic-gate 	}
11470Sstevel@tonic-gate 
11480Sstevel@tonic-gate 	if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) ||
11490Sstevel@tonic-gate 	    IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) {
11500Sstevel@tonic-gate 		/*
11510Sstevel@tonic-gate 		 * Guard against response from 0.0.0.0
11520Sstevel@tonic-gate 		 * and ::. Log a trace message
11530Sstevel@tonic-gate 		 */
11540Sstevel@tonic-gate 		logtrace("probe response from %s on %s\n",
11550Sstevel@tonic-gate 		    pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)),
11560Sstevel@tonic-gate 		    pii->pii_name);
11570Sstevel@tonic-gate 		return;
11580Sstevel@tonic-gate 	}
11590Sstevel@tonic-gate 
11600Sstevel@tonic-gate 	/*
11610Sstevel@tonic-gate 	 * This address is one of our own, so reject this address as a
11620Sstevel@tonic-gate 	 * valid probe target.
11630Sstevel@tonic-gate 	 */
11640Sstevel@tonic-gate 	af = pii->pii_af;
11652250Srk129064 	if (own_address(fromaddr))
11660Sstevel@tonic-gate 		return;
11670Sstevel@tonic-gate 
11680Sstevel@tonic-gate 	/*
11690Sstevel@tonic-gate 	 * If the phyint is part a named group, then add the address to all
11700Sstevel@tonic-gate 	 * members of the group.  Otherwise, add the address only to the
11710Sstevel@tonic-gate 	 * phyint itself, since other phyints in the anongroup may not be on
11720Sstevel@tonic-gate 	 * the same subnet.
11730Sstevel@tonic-gate 	 */
11740Sstevel@tonic-gate 	pi = pii->pii_phyint;
11750Sstevel@tonic-gate 	if (pi->pi_group == phyint_anongroup) {
11760Sstevel@tonic-gate 		target_add(pii, fromaddr, _B_FALSE);
11770Sstevel@tonic-gate 	} else {
11780Sstevel@tonic-gate 		pi = pi->pi_group->pg_phyint;
11790Sstevel@tonic-gate 		for (; pi != NULL; pi = pi->pi_pgnext)
11800Sstevel@tonic-gate 			target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE);
11810Sstevel@tonic-gate 	}
11820Sstevel@tonic-gate }
11830Sstevel@tonic-gate 
11840Sstevel@tonic-gate /*
11850Sstevel@tonic-gate  * Compute CRTT given an existing scaled average, scaled deviation estimate
11860Sstevel@tonic-gate  * and a new rtt time.  The formula is from Jacobson and Karels'
11870Sstevel@tonic-gate  * "Congestion Avoidance and Control" in SIGCOMM '88.  The variable names
11880Sstevel@tonic-gate  * are the same as those in Appendix A.2 of that paper.
11890Sstevel@tonic-gate  *
11900Sstevel@tonic-gate  * m = new measurement
11910Sstevel@tonic-gate  * sa = scaled RTT average (8 * average estimates)
11920Sstevel@tonic-gate  * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates).
11930Sstevel@tonic-gate  * crtt = Conservative round trip time. Used to determine whether probe
11940Sstevel@tonic-gate  * has timed out.
11950Sstevel@tonic-gate  *
11960Sstevel@tonic-gate  * New scaled average and deviation are passed back via sap and svp
11970Sstevel@tonic-gate  */
11988485SPeter.Memishian@Sun.COM static int64_t
compute_crtt(int64_t * sap,int64_t * svp,int64_t m)11998485SPeter.Memishian@Sun.COM compute_crtt(int64_t *sap, int64_t *svp, int64_t m)
12000Sstevel@tonic-gate {
12018485SPeter.Memishian@Sun.COM 	int64_t sa = *sap;
12028485SPeter.Memishian@Sun.COM 	int64_t sv = *svp;
12038485SPeter.Memishian@Sun.COM 	int64_t crtt;
12048485SPeter.Memishian@Sun.COM 	int64_t saved_m = m;
12050Sstevel@tonic-gate 
12060Sstevel@tonic-gate 	assert(*sap >= -1);
12070Sstevel@tonic-gate 	assert(*svp >= 0);
12080Sstevel@tonic-gate 
12090Sstevel@tonic-gate 	if (sa != -1) {
12100Sstevel@tonic-gate 		/*
12110Sstevel@tonic-gate 		 * Update average estimator:
12120Sstevel@tonic-gate 		 *	new rtt = old rtt + 1/8 Error
12130Sstevel@tonic-gate 		 *	    where Error = m - old rtt
12140Sstevel@tonic-gate 		 *	i.e. 8 * new rtt = 8 * old rtt + Error
12150Sstevel@tonic-gate 		 *	i.e. new sa =  old sa + Error
12160Sstevel@tonic-gate 		 */
12170Sstevel@tonic-gate 		m -= sa >> 3;		/* m is now Error in estimate. */
12180Sstevel@tonic-gate 		if ((sa += m) < 0) {
12190Sstevel@tonic-gate 			/* Don't allow the smoothed average to be negative. */
12200Sstevel@tonic-gate 			sa = 0;
12210Sstevel@tonic-gate 		}
12220Sstevel@tonic-gate 
12230Sstevel@tonic-gate 		/*
12240Sstevel@tonic-gate 		 * Update deviation estimator:
12250Sstevel@tonic-gate 		 *	new mdev =  old mdev + 1/4 (abs(Error) - old mdev)
12260Sstevel@tonic-gate 		 *	i.e. 4 * new mdev = 4 * old mdev +
12270Sstevel@tonic-gate 		 *		(abs(Error) - old mdev)
12280Sstevel@tonic-gate 		 * 	i.e. new sv = old sv + (abs(Error) - old mdev)
12290Sstevel@tonic-gate 		 */
12300Sstevel@tonic-gate 		if (m < 0)
12310Sstevel@tonic-gate 			m = -m;
12320Sstevel@tonic-gate 		m -= sv >> 2;
12330Sstevel@tonic-gate 		sv += m;
12340Sstevel@tonic-gate 	} else {
12350Sstevel@tonic-gate 		/* Initialization. This is the first response received. */
12360Sstevel@tonic-gate 		sa = (m << 3);
12370Sstevel@tonic-gate 		sv = (m << 1);
12380Sstevel@tonic-gate 	}
12390Sstevel@tonic-gate 
12400Sstevel@tonic-gate 	crtt = (sa >> 3) + sv;
12410Sstevel@tonic-gate 
12420Sstevel@tonic-gate 	if (debug & D_PROBE) {
12438485SPeter.Memishian@Sun.COM 		logerr("compute_crtt: m = %lld sa = %lld, sv = %lld -> "
12448485SPeter.Memishian@Sun.COM 		    "crtt = %lld\n", saved_m, sa, sv, crtt);
12450Sstevel@tonic-gate 	}
12460Sstevel@tonic-gate 
12470Sstevel@tonic-gate 	*sap = sa;
12480Sstevel@tonic-gate 	*svp = sv;
12490Sstevel@tonic-gate 
12500Sstevel@tonic-gate 	/*
12510Sstevel@tonic-gate 	 * CRTT = average estimates  + 4 * deviation estimates
12520Sstevel@tonic-gate 	 *	= sa / 8 + sv
12530Sstevel@tonic-gate 	 */
12540Sstevel@tonic-gate 	return (crtt);
12550Sstevel@tonic-gate }
12560Sstevel@tonic-gate 
12570Sstevel@tonic-gate static void
pi_set_crtt(struct target * tg,int64_t m,boolean_t is_probe_uni)12588485SPeter.Memishian@Sun.COM pi_set_crtt(struct target *tg, int64_t m, boolean_t is_probe_uni)
12590Sstevel@tonic-gate {
12600Sstevel@tonic-gate 	struct phyint_instance *pii = tg->tg_phyint_inst;
12610Sstevel@tonic-gate 	int probe_interval = pii->pii_phyint->pi_group->pg_probeint;
12628485SPeter.Memishian@Sun.COM 	int64_t sa = tg->tg_rtt_sa;
12638485SPeter.Memishian@Sun.COM 	int64_t sv = tg->tg_rtt_sd;
12640Sstevel@tonic-gate 	int new_crtt;
12650Sstevel@tonic-gate 	int i;
12660Sstevel@tonic-gate 
12670Sstevel@tonic-gate 	if (debug & D_PROBE)
12688485SPeter.Memishian@Sun.COM 		logdebug("pi_set_crtt: target -  m %lld\n", m);
12690Sstevel@tonic-gate 
12700Sstevel@tonic-gate 	/* store the round trip time, in case we need to defer computation */
12710Sstevel@tonic-gate 	tg->tg_deferred[tg->tg_num_deferred] = m;
12720Sstevel@tonic-gate 
12738485SPeter.Memishian@Sun.COM 	new_crtt = ns2ms(compute_crtt(&sa, &sv, m));
12740Sstevel@tonic-gate 
12750Sstevel@tonic-gate 	/*
12760Sstevel@tonic-gate 	 * If this probe's round trip time would singlehandedly cause an
12770Sstevel@tonic-gate 	 * increase in the group's probe interval consider it suspect.
12780Sstevel@tonic-gate 	 */
12790Sstevel@tonic-gate 	if ((new_crtt > probe_interval) && is_probe_uni) {
12800Sstevel@tonic-gate 		if (debug & D_PROBE) {
12810Sstevel@tonic-gate 			logdebug("Received a suspect probe on %s, new_crtt ="
12820Sstevel@tonic-gate 			    " %d, probe_interval = %d, num_deferred = %d\n",
12830Sstevel@tonic-gate 			    pii->pii_probe_logint->li_name, new_crtt,
12840Sstevel@tonic-gate 			    probe_interval, tg->tg_num_deferred);
12850Sstevel@tonic-gate 		}
12860Sstevel@tonic-gate 
12870Sstevel@tonic-gate 		/*
12880Sstevel@tonic-gate 		 * If we've deferred as many rtts as we plan on deferring, then
12890Sstevel@tonic-gate 		 * assume the link really did slow down and process all queued
12900Sstevel@tonic-gate 		 * rtts
12910Sstevel@tonic-gate 		 */
12920Sstevel@tonic-gate 		if (tg->tg_num_deferred == MAXDEFERREDRTT) {
12930Sstevel@tonic-gate 			if (debug & D_PROBE) {
12940Sstevel@tonic-gate 				logdebug("Received MAXDEFERREDRTT probes which "
12950Sstevel@tonic-gate 				    "would cause an increased probe_interval.  "
12960Sstevel@tonic-gate 				    "Integrating queued rtt data points.\n");
12970Sstevel@tonic-gate 			}
12980Sstevel@tonic-gate 
12990Sstevel@tonic-gate 			for (i = 0; i <= tg->tg_num_deferred; i++) {
13008485SPeter.Memishian@Sun.COM 				tg->tg_crtt = ns2ms(compute_crtt(&tg->tg_rtt_sa,
13018485SPeter.Memishian@Sun.COM 				    &tg->tg_rtt_sd, tg->tg_deferred[i]));
13020Sstevel@tonic-gate 			}
13030Sstevel@tonic-gate 
13040Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13050Sstevel@tonic-gate 		} else {
13060Sstevel@tonic-gate 			tg->tg_num_deferred++;
13070Sstevel@tonic-gate 		}
13080Sstevel@tonic-gate 		return;
13090Sstevel@tonic-gate 	}
13100Sstevel@tonic-gate 
13110Sstevel@tonic-gate 	/*
13120Sstevel@tonic-gate 	 * If this is a normal probe, or an RTT probe that would lead to a
13130Sstevel@tonic-gate 	 * reduced CRTT, then update our CRTT data.  Further, if this was
13140Sstevel@tonic-gate 	 * a normal probe, pitch any deferred probes since our probes are
13150Sstevel@tonic-gate 	 * again being answered within our CRTT estimates.
13160Sstevel@tonic-gate 	 */
13170Sstevel@tonic-gate 	if (is_probe_uni || new_crtt < tg->tg_crtt) {
13180Sstevel@tonic-gate 		tg->tg_rtt_sa = sa;
13190Sstevel@tonic-gate 		tg->tg_rtt_sd = sv;
13200Sstevel@tonic-gate 		tg->tg_crtt = new_crtt;
13210Sstevel@tonic-gate 		if (is_probe_uni)
13220Sstevel@tonic-gate 			tg->tg_num_deferred = 0;
13230Sstevel@tonic-gate 	}
13240Sstevel@tonic-gate }
13250Sstevel@tonic-gate 
/*
 * Search the ancillary data of `msg' for a control message whose level is
 * `cmsg_level' and whose type is `cmsg_type'.
 *
 * Returns a pointer to the data of the first matching control message, or
 * NULL if `msg' carries no such message.
 */
static void *
find_ancillary(struct msghdr *msg, int cmsg_level, int cmsg_type)
{
	struct cmsghdr *hdr = CMSG_FIRSTHDR(msg);

	while (hdr != NULL) {
		if (hdr->cmsg_level == cmsg_level &&
		    hdr->cmsg_type == cmsg_type)
			return (CMSG_DATA(hdr));

		hdr = CMSG_NXTHDR(msg, hdr);
	}

	return (NULL);
}
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate /*
13468485SPeter.Memishian@Sun.COM  * Try to activate another INACTIVE interface in the same group as `pi'.
13478485SPeter.Memishian@Sun.COM  * Prefer STANDBY INACTIVE to just INACTIVE.
13488485SPeter.Memishian@Sun.COM  */
13498485SPeter.Memishian@Sun.COM void
phyint_activate_another(struct phyint * pi)13508485SPeter.Memishian@Sun.COM phyint_activate_another(struct phyint *pi)
13518485SPeter.Memishian@Sun.COM {
13528485SPeter.Memishian@Sun.COM 	struct phyint *pi2;
13538485SPeter.Memishian@Sun.COM 	struct phyint *inactivepi = NULL;
13548485SPeter.Memishian@Sun.COM 
13558485SPeter.Memishian@Sun.COM 	if (pi->pi_group == phyint_anongroup)
13568485SPeter.Memishian@Sun.COM 		return;
13578485SPeter.Memishian@Sun.COM 
13588485SPeter.Memishian@Sun.COM 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1359*10649SPeter.Memishian@Sun.COM 		if (pi == pi2 || !phyint_is_functioning(pi2) ||
13608485SPeter.Memishian@Sun.COM 		    !(pi2->pi_flags & IFF_INACTIVE))
13618485SPeter.Memishian@Sun.COM 			continue;
13628485SPeter.Memishian@Sun.COM 
13638485SPeter.Memishian@Sun.COM 		inactivepi = pi2;
13648485SPeter.Memishian@Sun.COM 		if (pi2->pi_flags & IFF_STANDBY)
13658485SPeter.Memishian@Sun.COM 			break;
13668485SPeter.Memishian@Sun.COM 	}
13678485SPeter.Memishian@Sun.COM 
13688485SPeter.Memishian@Sun.COM 	if (inactivepi != NULL)
13698485SPeter.Memishian@Sun.COM 		(void) change_pif_flags(inactivepi, 0, IFF_INACTIVE);
13708485SPeter.Memishian@Sun.COM }
13718485SPeter.Memishian@Sun.COM 
13728485SPeter.Memishian@Sun.COM /*
137310290SPeter.Memishian@Sun.COM  * Transition a phyint to PI_RUNNING.  The caller must ensure that the
137410290SPeter.Memishian@Sun.COM  * transition is appropriate.  Clears IFF_OFFLINE or IFF_FAILED if
137510290SPeter.Memishian@Sun.COM  * appropriate.  Also sets IFF_INACTIVE on this or other interfaces as
137610290SPeter.Memishian@Sun.COM  * appropriate (see comment below).  Finally, also updates the phyint's group
137710290SPeter.Memishian@Sun.COM  * state to account for the change.
13788485SPeter.Memishian@Sun.COM  */
13798485SPeter.Memishian@Sun.COM void
phyint_transition_to_running(struct phyint * pi)13808485SPeter.Memishian@Sun.COM phyint_transition_to_running(struct phyint *pi)
13818485SPeter.Memishian@Sun.COM {
13828485SPeter.Memishian@Sun.COM 	struct phyint *pi2;
13838485SPeter.Memishian@Sun.COM 	struct phyint *actstandbypi = NULL;
13848485SPeter.Memishian@Sun.COM 	uint_t nactive = 0, nnonstandby = 0;
13858485SPeter.Memishian@Sun.COM 	boolean_t onlining = (pi->pi_state == PI_OFFLINE);
138610290SPeter.Memishian@Sun.COM 	boolean_t initial = (pi->pi_state == PI_INIT);
13878485SPeter.Memishian@Sun.COM 	uint64_t set, clear;
13888485SPeter.Memishian@Sun.COM 
13898485SPeter.Memishian@Sun.COM 	/*
13908485SPeter.Memishian@Sun.COM 	 * The interface is running again, but should it or another interface
13918485SPeter.Memishian@Sun.COM 	 * in the group end up INACTIVE?  There are three cases:
13928485SPeter.Memishian@Sun.COM 	 *
13938485SPeter.Memishian@Sun.COM 	 * 1. If it's a STANDBY interface, it should be end up INACTIVE if
13948485SPeter.Memishian@Sun.COM 	 *    the group is operating at capacity (i.e., there are at least as
13958485SPeter.Memishian@Sun.COM 	 *    many active interfaces as non-STANDBY interfaces in the group).
13968485SPeter.Memishian@Sun.COM 	 *    No other interfaces should be changed.
13978485SPeter.Memishian@Sun.COM 	 *
13988485SPeter.Memishian@Sun.COM 	 * 2. If it's a non-STANDBY interface and we're onlining it or
13998485SPeter.Memishian@Sun.COM 	 *    FAILBACK is enabled, then it should *not* end up INACTIVE.
14008485SPeter.Memishian@Sun.COM 	 *    Further, if the group is above capacity as a result of this
14018485SPeter.Memishian@Sun.COM 	 *    interface, then an active STANDBY interface in the group should
14028485SPeter.Memishian@Sun.COM 	 *    end up INACTIVE.
14038485SPeter.Memishian@Sun.COM 	 *
14048485SPeter.Memishian@Sun.COM 	 * 3. If it's a non-STANDBY interface, we're repairing it, and
14058485SPeter.Memishian@Sun.COM 	 *    FAILBACK is disabled, then it should end up INACTIVE *unless*
14068485SPeter.Memishian@Sun.COM 	 *    the group was failed (in which case we have no choice but to
14078485SPeter.Memishian@Sun.COM 	 *    use it).  No other interfaces should be changed.
14088485SPeter.Memishian@Sun.COM 	 */
14098485SPeter.Memishian@Sun.COM 	if (pi->pi_group != phyint_anongroup) {
14108485SPeter.Memishian@Sun.COM 		pi2 = pi->pi_group->pg_phyint;
14118485SPeter.Memishian@Sun.COM 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
14128485SPeter.Memishian@Sun.COM 			if (!(pi2->pi_flags & IFF_STANDBY))
14138485SPeter.Memishian@Sun.COM 				nnonstandby++;
14148485SPeter.Memishian@Sun.COM 
1415*10649SPeter.Memishian@Sun.COM 			if (phyint_is_functioning(pi2) &&
1416*10649SPeter.Memishian@Sun.COM 			    !(pi2->pi_flags & IFF_INACTIVE)) {
1417*10649SPeter.Memishian@Sun.COM 				nactive++;
1418*10649SPeter.Memishian@Sun.COM 				if (pi2->pi_flags & IFF_STANDBY)
1419*10649SPeter.Memishian@Sun.COM 					actstandbypi = pi2;
14208485SPeter.Memishian@Sun.COM 			}
14218485SPeter.Memishian@Sun.COM 		}
14228485SPeter.Memishian@Sun.COM 	}
14238485SPeter.Memishian@Sun.COM 
14248485SPeter.Memishian@Sun.COM 	set = 0;
14258485SPeter.Memishian@Sun.COM 	clear = (onlining ? IFF_OFFLINE : IFF_FAILED);
14268485SPeter.Memishian@Sun.COM 
14278485SPeter.Memishian@Sun.COM 	if (pi->pi_flags & IFF_STANDBY) {			/* case 1 */
14288485SPeter.Memishian@Sun.COM 		if (nactive >= nnonstandby)
14298485SPeter.Memishian@Sun.COM 			set |= IFF_INACTIVE;
14308485SPeter.Memishian@Sun.COM 		else
14318485SPeter.Memishian@Sun.COM 			clear |= IFF_INACTIVE;
14328485SPeter.Memishian@Sun.COM 	} else if (onlining || failback_enabled) {		/* case 2 */
14338485SPeter.Memishian@Sun.COM 		if (nactive >= nnonstandby && actstandbypi != NULL)
14348485SPeter.Memishian@Sun.COM 			(void) change_pif_flags(actstandbypi, IFF_INACTIVE, 0);
143510290SPeter.Memishian@Sun.COM 	} else if (!initial && !GROUP_FAILED(pi->pi_group)) {	/* case 3 */
14368485SPeter.Memishian@Sun.COM 		set |= IFF_INACTIVE;
14378485SPeter.Memishian@Sun.COM 	}
14388485SPeter.Memishian@Sun.COM 	(void) change_pif_flags(pi, set, clear);
14398485SPeter.Memishian@Sun.COM 
14408485SPeter.Memishian@Sun.COM 	phyint_chstate(pi, PI_RUNNING);
14418485SPeter.Memishian@Sun.COM 
14428485SPeter.Memishian@Sun.COM 	/*
14438485SPeter.Memishian@Sun.COM 	 * Update the group state to account for the change.
14448485SPeter.Memishian@Sun.COM 	 */
14458485SPeter.Memishian@Sun.COM 	phyint_group_refresh_state(pi->pi_group);
14468485SPeter.Memishian@Sun.COM }
14478485SPeter.Memishian@Sun.COM 
14488485SPeter.Memishian@Sun.COM /*
1449*10649SPeter.Memishian@Sun.COM  * Adjust IFF_INACTIVE on the provided `pi' to trend the group configuration
1450*10649SPeter.Memishian@Sun.COM  * to have at least one active interface and as many active interfaces as
1451*10649SPeter.Memishian@Sun.COM  * non-standby interfaces.
1452*10649SPeter.Memishian@Sun.COM  */
1453*10649SPeter.Memishian@Sun.COM void
phyint_standby_refresh_inactive(struct phyint * pi)1454*10649SPeter.Memishian@Sun.COM phyint_standby_refresh_inactive(struct phyint *pi)
1455*10649SPeter.Memishian@Sun.COM {
1456*10649SPeter.Memishian@Sun.COM 	struct phyint *pi2;
1457*10649SPeter.Memishian@Sun.COM 	uint_t nactive = 0, nnonstandby = 0;
1458*10649SPeter.Memishian@Sun.COM 
1459*10649SPeter.Memishian@Sun.COM 	/*
1460*10649SPeter.Memishian@Sun.COM 	 * All phyints in the anonymous group are effectively in their own
1461*10649SPeter.Memishian@Sun.COM 	 * group and thus active regardless of whether they're marked standby.
1462*10649SPeter.Memishian@Sun.COM 	 */
1463*10649SPeter.Memishian@Sun.COM 	if (pi->pi_group == phyint_anongroup) {
1464*10649SPeter.Memishian@Sun.COM 		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
1465*10649SPeter.Memishian@Sun.COM 		return;
1466*10649SPeter.Memishian@Sun.COM 	}
1467*10649SPeter.Memishian@Sun.COM 
1468*10649SPeter.Memishian@Sun.COM 	/*
1469*10649SPeter.Memishian@Sun.COM 	 * If the phyint isn't functioning we can't consider it.
1470*10649SPeter.Memishian@Sun.COM 	 */
1471*10649SPeter.Memishian@Sun.COM 	if (!phyint_is_functioning(pi))
1472*10649SPeter.Memishian@Sun.COM 		return;
1473*10649SPeter.Memishian@Sun.COM 
1474*10649SPeter.Memishian@Sun.COM 	for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
1475*10649SPeter.Memishian@Sun.COM 		if (!(pi2->pi_flags & IFF_STANDBY))
1476*10649SPeter.Memishian@Sun.COM 			nnonstandby++;
1477*10649SPeter.Memishian@Sun.COM 
1478*10649SPeter.Memishian@Sun.COM 		if (phyint_is_functioning(pi2) &&
1479*10649SPeter.Memishian@Sun.COM 		    !(pi2->pi_flags & IFF_INACTIVE))
1480*10649SPeter.Memishian@Sun.COM 			nactive++;
1481*10649SPeter.Memishian@Sun.COM 	}
1482*10649SPeter.Memishian@Sun.COM 
1483*10649SPeter.Memishian@Sun.COM 	if (nactive == 0 || nactive < nnonstandby)
1484*10649SPeter.Memishian@Sun.COM 		(void) change_pif_flags(pi, 0, IFF_INACTIVE);
1485*10649SPeter.Memishian@Sun.COM 	else if (nactive > nnonstandby)
1486*10649SPeter.Memishian@Sun.COM 		(void) change_pif_flags(pi, IFF_INACTIVE, 0);
1487*10649SPeter.Memishian@Sun.COM }
1488*10649SPeter.Memishian@Sun.COM 
1489*10649SPeter.Memishian@Sun.COM /*
14900Sstevel@tonic-gate  * See if a previously failed interface has started working again.
14910Sstevel@tonic-gate  */
14920Sstevel@tonic-gate void
phyint_check_for_repair(struct phyint * pi)14930Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi)
14940Sstevel@tonic-gate {
14958485SPeter.Memishian@Sun.COM 	if (!phyint_repaired(pi))
14968485SPeter.Memishian@Sun.COM 		return;
14970Sstevel@tonic-gate 
14988485SPeter.Memishian@Sun.COM 	if (pi->pi_group == phyint_anongroup) {
14998485SPeter.Memishian@Sun.COM 		logerr("IP interface repair detected on %s\n", pi->pi_name);
15008485SPeter.Memishian@Sun.COM 	} else {
15018485SPeter.Memishian@Sun.COM 		logerr("IP interface repair detected on %s of group %s\n",
15028485SPeter.Memishian@Sun.COM 		    pi->pi_name, pi->pi_group->pg_name);
15038485SPeter.Memishian@Sun.COM 	}
15040Sstevel@tonic-gate 
15058485SPeter.Memishian@Sun.COM 	/*
15068485SPeter.Memishian@Sun.COM 	 * If the interface is PI_OFFLINE, it can't be made PI_RUNNING yet.
15078485SPeter.Memishian@Sun.COM 	 * So just clear IFF_OFFLINE and defer phyint_transition_to_running()
15088485SPeter.Memishian@Sun.COM 	 * until it is brought back online.
15098485SPeter.Memishian@Sun.COM 	 */
15108485SPeter.Memishian@Sun.COM 	if (pi->pi_state == PI_OFFLINE) {
15118485SPeter.Memishian@Sun.COM 		(void) change_pif_flags(pi, 0, IFF_FAILED);
15128485SPeter.Memishian@Sun.COM 		return;
15138485SPeter.Memishian@Sun.COM 	}
15140Sstevel@tonic-gate 
15158485SPeter.Memishian@Sun.COM 	phyint_transition_to_running(pi);	/* calls phyint_chstate() */
15160Sstevel@tonic-gate }
15170Sstevel@tonic-gate 
15180Sstevel@tonic-gate /*
15198485SPeter.Memishian@Sun.COM  * See if an interface has failed, or if the whole group of interfaces has
15208485SPeter.Memishian@Sun.COM  * failed.
15210Sstevel@tonic-gate  */
15220Sstevel@tonic-gate static void
phyint_inst_check_for_failure(struct phyint_instance * pii)15230Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii)
15240Sstevel@tonic-gate {
15258485SPeter.Memishian@Sun.COM 	struct phyint	*pi = pii->pii_phyint;
15268485SPeter.Memishian@Sun.COM 	struct phyint	*pi2;
15278485SPeter.Memishian@Sun.COM 	boolean_t	was_active;
15280Sstevel@tonic-gate 
15290Sstevel@tonic-gate 	switch (failure_state(pii)) {
15300Sstevel@tonic-gate 	case PHYINT_FAILURE:
15318485SPeter.Memishian@Sun.COM 		was_active = ((pi->pi_flags & IFF_INACTIVE) == 0);
15328485SPeter.Memishian@Sun.COM 
15338485SPeter.Memishian@Sun.COM 		(void) change_pif_flags(pi, IFF_FAILED, IFF_INACTIVE);
15340Sstevel@tonic-gate 		if (pi->pi_group == phyint_anongroup) {
15358485SPeter.Memishian@Sun.COM 			logerr("IP interface failure detected on %s\n",
15368485SPeter.Memishian@Sun.COM 			    pii->pii_name);
15370Sstevel@tonic-gate 		} else {
15388485SPeter.Memishian@Sun.COM 			logerr("IP interface failure detected on %s of group"
15398485SPeter.Memishian@Sun.COM 			    " %s\n", pii->pii_name, pi->pi_group->pg_name);
15400Sstevel@tonic-gate 		}
15418485SPeter.Memishian@Sun.COM 
15420Sstevel@tonic-gate 		/*
15438700SPeter.Memishian@Sun.COM 		 * If the failed interface was active, activate another
15448700SPeter.Memishian@Sun.COM 		 * INACTIVE interface in the group if possible.
15458700SPeter.Memishian@Sun.COM 		 */
15468700SPeter.Memishian@Sun.COM 		if (was_active)
15478700SPeter.Memishian@Sun.COM 			phyint_activate_another(pi);
15488700SPeter.Memishian@Sun.COM 
15498700SPeter.Memishian@Sun.COM 		/*
15508485SPeter.Memishian@Sun.COM 		 * If the interface is offline, the state change will be
15518485SPeter.Memishian@Sun.COM 		 * noted when it comes back online.
15520Sstevel@tonic-gate 		 */
15530Sstevel@tonic-gate 		if (pi->pi_state != PI_OFFLINE) {
15540Sstevel@tonic-gate 			phyint_chstate(pi, PI_FAILED);
15550Sstevel@tonic-gate 			reset_crtt_all(pi);
15560Sstevel@tonic-gate 		}
15570Sstevel@tonic-gate 		break;
15580Sstevel@tonic-gate 
15590Sstevel@tonic-gate 	case GROUP_FAILURE:
15608485SPeter.Memishian@Sun.COM 		pi2 = pi->pi_group->pg_phyint;
15618485SPeter.Memishian@Sun.COM 		for (; pi2 != NULL; pi2 = pi2->pi_pgnext) {
15628485SPeter.Memishian@Sun.COM 			(void) change_pif_flags(pi2, IFF_FAILED, IFF_INACTIVE);
15638485SPeter.Memishian@Sun.COM 			if (pi2->pi_state == PI_OFFLINE) /* see comment above */
15640Sstevel@tonic-gate 				continue;
15658485SPeter.Memishian@Sun.COM 
15660Sstevel@tonic-gate 			reset_crtt_all(pi2);
15670Sstevel@tonic-gate 			/*
15688485SPeter.Memishian@Sun.COM 			 * In the case of host targets, we would have flushed
15698485SPeter.Memishian@Sun.COM 			 * the targets, and gone to PI_NOTARGETS state.
15700Sstevel@tonic-gate 			 */
15710Sstevel@tonic-gate 			if (pi2->pi_state == PI_RUNNING)
1572704Sethindra 				phyint_chstate(pi2, PI_FAILED);
15730Sstevel@tonic-gate 		}
15740Sstevel@tonic-gate 		break;
15750Sstevel@tonic-gate 
15760Sstevel@tonic-gate 	default:
15770Sstevel@tonic-gate 		break;
15780Sstevel@tonic-gate 	}
15790Sstevel@tonic-gate }
15800Sstevel@tonic-gate 
15810Sstevel@tonic-gate /*
15820Sstevel@tonic-gate  * Determines if any timeout event has occurred and returns the number of
15830Sstevel@tonic-gate  * milliseconds until the next timeout event for the phyint. Returns
15840Sstevel@tonic-gate  * TIMER_INFINITY for "never".
15850Sstevel@tonic-gate  */
15860Sstevel@tonic-gate uint_t
phyint_inst_timer(struct phyint_instance * pii)15870Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii)
15880Sstevel@tonic-gate {
15890Sstevel@tonic-gate 	int 	pr_ndx;
15900Sstevel@tonic-gate 	uint_t	timeout;
15910Sstevel@tonic-gate 	struct	target	*cur_tg;
15920Sstevel@tonic-gate 	struct	probe_stats *pr_statp;
15930Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
15940Sstevel@tonic-gate 	struct	phyint *pi;
15950Sstevel@tonic-gate 	int	valid_unack_count;
15960Sstevel@tonic-gate 	int	i;
15970Sstevel@tonic-gate 	int	interval;
15980Sstevel@tonic-gate 	uint_t	check_time;
15990Sstevel@tonic-gate 	uint_t	cur_time;
16000Sstevel@tonic-gate 	hrtime_t cur_hrtime;
16010Sstevel@tonic-gate 	int	probe_interval = pii->pii_phyint->pi_group->pg_probeint;
16020Sstevel@tonic-gate 
16038485SPeter.Memishian@Sun.COM 	cur_hrtime = gethrtime();
16048485SPeter.Memishian@Sun.COM 	cur_time = ns2ms(cur_hrtime);
16050Sstevel@tonic-gate 
16060Sstevel@tonic-gate 	if (debug & D_TIMER) {
16070Sstevel@tonic-gate 		logdebug("phyint_inst_timer(%s %s)\n",
16080Sstevel@tonic-gate 		    AF_STR(pii->pii_af), pii->pii_name);
16090Sstevel@tonic-gate 	}
16100Sstevel@tonic-gate 
16110Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
16120Sstevel@tonic-gate 	if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) {
16130Sstevel@tonic-gate 		/*
16140Sstevel@tonic-gate 		 * Check to see if we're here due to link up/down flapping; If
16150Sstevel@tonic-gate 		 * enough time has passed, then try to bring the interface
16160Sstevel@tonic-gate 		 * back up; otherwise, schedule a timer to bring it back up
16170Sstevel@tonic-gate 		 * when enough time *has* elapsed.
16180Sstevel@tonic-gate 		 */
16190Sstevel@tonic-gate 		pi = pii->pii_phyint;
16200Sstevel@tonic-gate 		if (pi->pi_state == PI_FAILED && LINK_UP(pi)) {
16210Sstevel@tonic-gate 			check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN;
16220Sstevel@tonic-gate 			if (check_time > cur_time)
16230Sstevel@tonic-gate 				return (check_time - cur_time);
16240Sstevel@tonic-gate 
16250Sstevel@tonic-gate 			phyint_check_for_repair(pi);
16260Sstevel@tonic-gate 		}
16270Sstevel@tonic-gate 	}
16280Sstevel@tonic-gate 
16290Sstevel@tonic-gate 	/*
16302496Smeem 	 * If probing is not enabled on this phyint instance, don't proceed.
16310Sstevel@tonic-gate 	 */
16322496Smeem 	if (!PROBE_ENABLED(pii))
16330Sstevel@tonic-gate 		return (TIMER_INFINITY);
16340Sstevel@tonic-gate 
16350Sstevel@tonic-gate 	/*
16360Sstevel@tonic-gate 	 * If the timer has fired too soon, probably triggered
16370Sstevel@tonic-gate 	 * by some other phyint instance, return the remaining
16380Sstevel@tonic-gate 	 * time
16390Sstevel@tonic-gate 	 */
16400Sstevel@tonic-gate 	if (TIME_LT(cur_time, pii->pii_snxt_time))
16410Sstevel@tonic-gate 		return (pii->pii_snxt_time - cur_time);
16420Sstevel@tonic-gate 
16430Sstevel@tonic-gate 	/*
16440Sstevel@tonic-gate 	 * If the link is down, don't send any probes for now.
16450Sstevel@tonic-gate 	 */
16460Sstevel@tonic-gate 	if (LINK_DOWN(pii->pii_phyint))
16470Sstevel@tonic-gate 		return (TIMER_INFINITY);
16480Sstevel@tonic-gate 
16490Sstevel@tonic-gate 	/*
16500Sstevel@tonic-gate 	 * Randomize the next probe time, between MIN_RANDOM_FACTOR
16510Sstevel@tonic-gate 	 * and MAX_RANDOM_FACTOR with respect to the base probe time.
16520Sstevel@tonic-gate 	 * Base probe time is strictly periodic.
16530Sstevel@tonic-gate 	 */
16540Sstevel@tonic-gate 	interval = GET_RANDOM(
16550Sstevel@tonic-gate 	    (int)(MIN_RANDOM_FACTOR * user_probe_interval),
16560Sstevel@tonic-gate 	    (int)(MAX_RANDOM_FACTOR * user_probe_interval));
16570Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime + interval;
16580Sstevel@tonic-gate 
16590Sstevel@tonic-gate 	/*
16600Sstevel@tonic-gate 	 * Check if the current time > next time to probe. If so, we missed
16610Sstevel@tonic-gate 	 * sending 1 or more probes, probably due to heavy system load. At least
16620Sstevel@tonic-gate 	 * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we
16630Sstevel@tonic-gate 	 * were scheduled. Make adjustments to the times, in multiples of
16640Sstevel@tonic-gate 	 * user_probe_interval.
16650Sstevel@tonic-gate 	 */
16660Sstevel@tonic-gate 	if (TIME_GT(cur_time, pii->pii_snxt_time)) {
16670Sstevel@tonic-gate 		int n;
16680Sstevel@tonic-gate 
16690Sstevel@tonic-gate 		n = (cur_time - pii->pii_snxt_time) / user_probe_interval;
16700Sstevel@tonic-gate 		pii->pii_snxt_time 	+= (n + 1) * user_probe_interval;
16710Sstevel@tonic-gate 		pii->pii_snxt_basetime 	+= (n + 1) * user_probe_interval;
16720Sstevel@tonic-gate 		logtrace("missed sending %d probes cur_time %u snxt_time %u"
16730Sstevel@tonic-gate 		    " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time,
16740Sstevel@tonic-gate 		    pii->pii_snxt_basetime);
16750Sstevel@tonic-gate 
16760Sstevel@tonic-gate 		/* Collect statistics about missed probes */
16770Sstevel@tonic-gate 		probes_missed.pm_nprobes += n + 1;
16780Sstevel@tonic-gate 		probes_missed.pm_ntimes++;
16790Sstevel@tonic-gate 	}
16800Sstevel@tonic-gate 	pii->pii_snxt_basetime += user_probe_interval;
16810Sstevel@tonic-gate 	interval = pii->pii_snxt_time - cur_time;
16820Sstevel@tonic-gate 	if (debug & D_TARGET) {
16830Sstevel@tonic-gate 		logdebug("cur_time %u snxt_time %u snxt_basetime %u"
16840Sstevel@tonic-gate 		    " interval %u\n", cur_time, pii->pii_snxt_time,
16850Sstevel@tonic-gate 		    pii->pii_snxt_basetime, interval);
16860Sstevel@tonic-gate 	}
16870Sstevel@tonic-gate 
16880Sstevel@tonic-gate 	/*
16890Sstevel@tonic-gate 	 * If no targets are known, we need to send an ICMP multicast. The
16900Sstevel@tonic-gate 	 * probe type is PROBE_MULTI.  We'll check back in 'interval' msec
16910Sstevel@tonic-gate 	 * to see if we found a target.
16920Sstevel@tonic-gate 	 */
16930Sstevel@tonic-gate 	if (pii->pii_target_next == NULL) {
16940Sstevel@tonic-gate 		assert(pii->pii_ntargets == 0);
16950Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
16960Sstevel@tonic-gate 		probe(pii, PROBE_MULTI, cur_time);
16970Sstevel@tonic-gate 		return (interval);
16980Sstevel@tonic-gate 	}
16990Sstevel@tonic-gate 
17000Sstevel@tonic-gate 	if ((user_probe_interval != probe_interval) &&
17010Sstevel@tonic-gate 	    TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) {
17020Sstevel@tonic-gate 		/*
17030Sstevel@tonic-gate 		 * the failure detection (fd) probe timer has not yet fired.
17040Sstevel@tonic-gate 		 * Need to send only an rtt probe. The probe type is PROBE_RTT.
17050Sstevel@tonic-gate 		 */
17068485SPeter.Memishian@Sun.COM 		probe(pii, PROBE_RTT, cur_hrtime);
17070Sstevel@tonic-gate 		return (interval);
17080Sstevel@tonic-gate 	}
17090Sstevel@tonic-gate 	/*
17100Sstevel@tonic-gate 	 * the fd probe timer has fired. Need to do all failure
17110Sstevel@tonic-gate 	 * detection / recovery calculations, and then send an fd probe
17120Sstevel@tonic-gate 	 * of type PROBE_UNI.
17130Sstevel@tonic-gate 	 */
17140Sstevel@tonic-gate 	if (user_probe_interval == probe_interval) {
17150Sstevel@tonic-gate 		/*
17160Sstevel@tonic-gate 		 * We could have missed some probes, and then adjusted
17170Sstevel@tonic-gate 		 * pii_snxt_basetime above. Otherwise we could have
17180Sstevel@tonic-gate 		 * blindly added probe_interval to pii_fd_snxt_basetime.
17190Sstevel@tonic-gate 		 */
17200Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
17210Sstevel@tonic-gate 	} else {
17220Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime += probe_interval;
17230Sstevel@tonic-gate 		if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) {
17240Sstevel@tonic-gate 			int n;
17250Sstevel@tonic-gate 
17260Sstevel@tonic-gate 			n = (cur_time - pii->pii_fd_snxt_basetime) /
17270Sstevel@tonic-gate 			    probe_interval;
17280Sstevel@tonic-gate 			pii->pii_fd_snxt_basetime += (n + 1) * probe_interval;
17290Sstevel@tonic-gate 		}
17300Sstevel@tonic-gate 	}
17310Sstevel@tonic-gate 
17320Sstevel@tonic-gate 	/*
17330Sstevel@tonic-gate 	 * We can have at most, the latest 2 probes that we sent, in
17340Sstevel@tonic-gate 	 * the PR_UNACKED state. All previous probes sent, are either
17350Sstevel@tonic-gate 	 * PR_LOST or PR_ACKED. An unacknowledged probe is considered
17368485SPeter.Memishian@Sun.COM 	 * timed out if the probe's time_start + the CRTT < currenttime.
17370Sstevel@tonic-gate 	 * For each of the last 2 probes, examine whether it has timed
17380Sstevel@tonic-gate 	 * out. If so, mark it PR_LOST. The probe stats is a circular array.
17390Sstevel@tonic-gate 	 */
17400Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
17410Sstevel@tonic-gate 	valid_unack_count = 0;
17420Sstevel@tonic-gate 
17430Sstevel@tonic-gate 	for (i = 0; i < 2; i++) {
17440Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[pr_ndx];
17450Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
17460Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
17470Sstevel@tonic-gate 		case PR_ACKED:
17480Sstevel@tonic-gate 			/*
17490Sstevel@tonic-gate 			 * We received back an ACK, so the switch clearly
17500Sstevel@tonic-gate 			 * is not dropping our traffic, and thus we can
17510Sstevel@tonic-gate 			 * enable failure detection immediately.
17520Sstevel@tonic-gate 			 */
17530Sstevel@tonic-gate 			if (pii->pii_fd_hrtime > gethrtime()) {
17540Sstevel@tonic-gate 				if (debug & D_PROBE) {
17550Sstevel@tonic-gate 					logdebug("successful probe on %s; "
17560Sstevel@tonic-gate 					    "ending quiet period\n",
17570Sstevel@tonic-gate 					    pii->pii_phyint->pi_name);
17580Sstevel@tonic-gate 				}
17590Sstevel@tonic-gate 				pii->pii_fd_hrtime = gethrtime();
17600Sstevel@tonic-gate 			}
17610Sstevel@tonic-gate 			break;
17620Sstevel@tonic-gate 
17630Sstevel@tonic-gate 		case PR_UNACKED:
17640Sstevel@tonic-gate 			assert(cur_tg != NULL);
17650Sstevel@tonic-gate 			/*
17660Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
17670Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
17680Sstevel@tonic-gate 			 * not available use group's probe interval,
17690Sstevel@tonic-gate 			 * which is a worst case estimate.
17700Sstevel@tonic-gate 			 */
17718485SPeter.Memishian@Sun.COM 			timeout = ns2ms(pr_statp->pr_hrtime_start);
17720Sstevel@tonic-gate 			if (cur_tg->tg_crtt != 0) {
17738485SPeter.Memishian@Sun.COM 				timeout += cur_tg->tg_crtt;
17740Sstevel@tonic-gate 			} else {
17758485SPeter.Memishian@Sun.COM 				timeout += probe_interval;
17760Sstevel@tonic-gate 			}
17770Sstevel@tonic-gate 			if (TIME_LT(timeout, cur_time)) {
17780Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
17798485SPeter.Memishian@Sun.COM 				probe_chstate(pr_statp, pii, PR_LOST);
17800Sstevel@tonic-gate 			} else if (i == 1) {
17810Sstevel@tonic-gate 				/*
17820Sstevel@tonic-gate 				 * We are forced to consider this probe
17830Sstevel@tonic-gate 				 * lost, as we can have at most 2 unack.
17840Sstevel@tonic-gate 				 * probes any time, and we will be sending a
17850Sstevel@tonic-gate 				 * probe at the end of this function.
17860Sstevel@tonic-gate 				 * Normally, we should not be here, but
17870Sstevel@tonic-gate 				 * this can happen if an incoming response
17880Sstevel@tonic-gate 				 * that was considered lost has increased
17890Sstevel@tonic-gate 				 * the crtt for this target, and also bumped
17900Sstevel@tonic-gate 				 * up the FDT. Note that we never cancel or
17910Sstevel@tonic-gate 				 * increase the current pii_time_left, so
17920Sstevel@tonic-gate 				 * when the timer fires, we find 2 valid
17930Sstevel@tonic-gate 				 * unacked probes, and they are yet to timeout
17940Sstevel@tonic-gate 				 */
17950Sstevel@tonic-gate 				pr_statp->pr_time_lost = cur_time;
17968485SPeter.Memishian@Sun.COM 				probe_chstate(pr_statp, pii, PR_LOST);
17970Sstevel@tonic-gate 			} else {
17980Sstevel@tonic-gate 				/*
17990Sstevel@tonic-gate 				 * Only the most recent probe can enter
18000Sstevel@tonic-gate 				 * this 'else' arm. The second most recent
18010Sstevel@tonic-gate 				 * probe must take either of the above arms,
18020Sstevel@tonic-gate 				 * if it is unacked.
18030Sstevel@tonic-gate 				 */
18040Sstevel@tonic-gate 				valid_unack_count++;
18050Sstevel@tonic-gate 			}
18060Sstevel@tonic-gate 			break;
18070Sstevel@tonic-gate 		}
18080Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pr_ndx);
18090Sstevel@tonic-gate 	}
18100Sstevel@tonic-gate 
18110Sstevel@tonic-gate 	/*
18120Sstevel@tonic-gate 	 * We send out 1 probe randomly in the interval between one half
18130Sstevel@tonic-gate 	 * and one probe interval for the group. Given that the CRTT is always
18140Sstevel@tonic-gate 	 * less than the group's probe interval, we can have at most 1
18150Sstevel@tonic-gate 	 * unacknowledged probe now.  All previous probes are either lost or
18160Sstevel@tonic-gate 	 * acked.
18170Sstevel@tonic-gate 	 */
18180Sstevel@tonic-gate 	assert(valid_unack_count == 0 || valid_unack_count == 1);
18190Sstevel@tonic-gate 
18200Sstevel@tonic-gate 	/*
18210Sstevel@tonic-gate 	 * The timer has fired. Take appropriate action depending
18220Sstevel@tonic-gate 	 * on the current state of the phyint.
18230Sstevel@tonic-gate 	 *
18248485SPeter.Memishian@Sun.COM 	 * PI_RUNNING state 	- Failure detection
18258485SPeter.Memishian@Sun.COM 	 * PI_FAILED state 	- Repair detection
18260Sstevel@tonic-gate 	 */
18270Sstevel@tonic-gate 	switch (pii->pii_phyint->pi_state) {
18280Sstevel@tonic-gate 	case PI_FAILED:
18290Sstevel@tonic-gate 		/*
18300Sstevel@tonic-gate 		 * If the most recent probe (excluding unacked probes that
18310Sstevel@tonic-gate 		 * are yet to time out) has been acked, check whether the
18328485SPeter.Memishian@Sun.COM 		 * phyint is now repaired.
18330Sstevel@tonic-gate 		 */
18340Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) {
18350Sstevel@tonic-gate 			phyint_check_for_repair(pii->pii_phyint);
18360Sstevel@tonic-gate 		}
18370Sstevel@tonic-gate 		break;
18380Sstevel@tonic-gate 
18390Sstevel@tonic-gate 	case PI_RUNNING:
18400Sstevel@tonic-gate 		/*
18410Sstevel@tonic-gate 		 * It's possible our probes have been lost because of a
18420Sstevel@tonic-gate 		 * spanning-tree mandated quiet period on the switch.  If so,
18438485SPeter.Memishian@Sun.COM 		 * ignore the lost probes.
18440Sstevel@tonic-gate 		 */
18450Sstevel@tonic-gate 		if (pii->pii_fd_hrtime - cur_hrtime > 0)
18460Sstevel@tonic-gate 			break;
18470Sstevel@tonic-gate 
18480Sstevel@tonic-gate 		if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) {
18490Sstevel@tonic-gate 			/*
18500Sstevel@tonic-gate 			 * We have 1 or more failed probes (excluding unacked
18510Sstevel@tonic-gate 			 * probes that are yet to time out). Determine if the
18528485SPeter.Memishian@Sun.COM 			 * phyint has failed.
18530Sstevel@tonic-gate 			 */
18540Sstevel@tonic-gate 			phyint_inst_check_for_failure(pii);
18550Sstevel@tonic-gate 		}
18560Sstevel@tonic-gate 		break;
18570Sstevel@tonic-gate 
18580Sstevel@tonic-gate 	default:
18590Sstevel@tonic-gate 		logerr("phyint_inst_timer: invalid state %d\n",
18600Sstevel@tonic-gate 		    pii->pii_phyint->pi_state);
18610Sstevel@tonic-gate 		abort();
18620Sstevel@tonic-gate 	}
18630Sstevel@tonic-gate 
18640Sstevel@tonic-gate 	/*
18650Sstevel@tonic-gate 	 * Start the next probe. probe() will also set pii->pii_probe_time_left
18660Sstevel@tonic-gate 	 * to the group's probe interval. If phyint_failed -> target_flush_hosts
18670Sstevel@tonic-gate 	 * was called, the target list may be empty.
18680Sstevel@tonic-gate 	 */
18690Sstevel@tonic-gate 	if (pii->pii_target_next != NULL) {
18708485SPeter.Memishian@Sun.COM 		probe(pii, PROBE_UNI, cur_hrtime);
18710Sstevel@tonic-gate 		/*
18720Sstevel@tonic-gate 		 * If we have just the one probe target, and we're not using
18730Sstevel@tonic-gate 		 * router targets, try to find another as we presently have
18740Sstevel@tonic-gate 		 * no resilience.
18750Sstevel@tonic-gate 		 */
18760Sstevel@tonic-gate 		if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1)
18778485SPeter.Memishian@Sun.COM 			probe(pii, PROBE_MULTI, cur_hrtime);
18780Sstevel@tonic-gate 	} else {
18798485SPeter.Memishian@Sun.COM 		probe(pii, PROBE_MULTI, cur_hrtime);
18800Sstevel@tonic-gate 	}
18810Sstevel@tonic-gate 	return (interval);
18820Sstevel@tonic-gate }
18830Sstevel@tonic-gate 
18840Sstevel@tonic-gate /*
18850Sstevel@tonic-gate  * Start the probe timer for an interface instance.
18860Sstevel@tonic-gate  */
18870Sstevel@tonic-gate void
start_timer(struct phyint_instance * pii)18880Sstevel@tonic-gate start_timer(struct phyint_instance *pii)
18890Sstevel@tonic-gate {
18900Sstevel@tonic-gate 	uint32_t interval;
18910Sstevel@tonic-gate 
18920Sstevel@tonic-gate 	/*
18930Sstevel@tonic-gate 	 * Spread the base probe times (pi_snxt_basetime) across phyints
18940Sstevel@tonic-gate 	 * uniformly over the (curtime..curtime + the group's probe_interval).
18950Sstevel@tonic-gate 	 * pi_snxt_basetime is strictly periodic with a frequency of
18960Sstevel@tonic-gate 	 * the group's probe interval. The actual probe time pi_snxt_time
18970Sstevel@tonic-gate 	 * adds some randomness to pi_snxt_basetime and happens in probe().
18980Sstevel@tonic-gate 	 * For the 1st probe on each phyint after the timer is started,
18990Sstevel@tonic-gate 	 * pi_snxt_time and pi_snxt_basetime are the same.
19000Sstevel@tonic-gate 	 */
19010Sstevel@tonic-gate 	interval = GET_RANDOM(0,
19020Sstevel@tonic-gate 	    (int)pii->pii_phyint->pi_group->pg_probeint);
19030Sstevel@tonic-gate 
19040Sstevel@tonic-gate 	pii->pii_snxt_basetime = getcurrenttime() + interval;
19050Sstevel@tonic-gate 	pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
19060Sstevel@tonic-gate 	pii->pii_snxt_time = pii->pii_snxt_basetime;
19070Sstevel@tonic-gate 	timer_schedule(interval);
19080Sstevel@tonic-gate }
19090Sstevel@tonic-gate 
19100Sstevel@tonic-gate /*
19110Sstevel@tonic-gate  * Restart the probe timer on an interface instance.
19120Sstevel@tonic-gate  */
19130Sstevel@tonic-gate static void
restart_timer(struct phyint_instance * pii)19140Sstevel@tonic-gate restart_timer(struct phyint_instance *pii)
19150Sstevel@tonic-gate {
19160Sstevel@tonic-gate 	/*
19170Sstevel@tonic-gate 	 * We don't need to restart the timer if it was never started in
19180Sstevel@tonic-gate 	 * the first place (pii->pii_basetime_inited not set), as the timer
19190Sstevel@tonic-gate 	 * won't have gone off yet.
19200Sstevel@tonic-gate 	 */
19210Sstevel@tonic-gate 	if (pii->pii_basetime_inited != 0) {
19220Sstevel@tonic-gate 
19230Sstevel@tonic-gate 		if (debug & D_LINKNOTE)
19240Sstevel@tonic-gate 			logdebug("restart timer: restarting timer on %s, "
19250Sstevel@tonic-gate 			    "address family %s\n", pii->pii_phyint->pi_name,
19260Sstevel@tonic-gate 			    AF_STR(pii->pii_af));
19270Sstevel@tonic-gate 
19280Sstevel@tonic-gate 		start_timer(pii);
19290Sstevel@tonic-gate 	}
19300Sstevel@tonic-gate }
19310Sstevel@tonic-gate 
19320Sstevel@tonic-gate static void
process_link_state_down(struct phyint * pi)19330Sstevel@tonic-gate process_link_state_down(struct phyint *pi)
19340Sstevel@tonic-gate {
19350Sstevel@tonic-gate 	logerr("The link has gone down on %s\n", pi->pi_name);
19360Sstevel@tonic-gate 
19370Sstevel@tonic-gate 	/*
19380Sstevel@tonic-gate 	 * Clear the probe statistics arrays, we don't want the repair
19398485SPeter.Memishian@Sun.COM 	 * detection logic relying on probes that were successful prior
19408485SPeter.Memishian@Sun.COM 	 * to the link going down.
19410Sstevel@tonic-gate 	 */
19420Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v4))
19430Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v4);
19440Sstevel@tonic-gate 	if (PROBE_CAPABLE(pi->pi_v6))
19450Sstevel@tonic-gate 		clear_pii_probe_stats(pi->pi_v6);
19460Sstevel@tonic-gate 	/*
19470Sstevel@tonic-gate 	 * Check for interface failure.  Although we know the interface
19480Sstevel@tonic-gate 	 * has failed, we don't know if all the other interfaces in the
19490Sstevel@tonic-gate 	 * group have failed as well.
19500Sstevel@tonic-gate 	 */
19510Sstevel@tonic-gate 	if ((pi->pi_state == PI_RUNNING) ||
19520Sstevel@tonic-gate 	    (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) {
19530Sstevel@tonic-gate 		if (debug & D_LINKNOTE) {
19540Sstevel@tonic-gate 			logdebug("process_link_state_down:"
19550Sstevel@tonic-gate 			    " checking for failure on %s\n", pi->pi_name);
19560Sstevel@tonic-gate 		}
19570Sstevel@tonic-gate 
19580Sstevel@tonic-gate 		if (pi->pi_v4 != NULL)
19590Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v4);
19600Sstevel@tonic-gate 		else if (pi->pi_v6 != NULL)
19610Sstevel@tonic-gate 			phyint_inst_check_for_failure(pi->pi_v6);
19620Sstevel@tonic-gate 	}
19630Sstevel@tonic-gate }
19640Sstevel@tonic-gate 
19650Sstevel@tonic-gate static void
process_link_state_up(struct phyint * pi)19660Sstevel@tonic-gate process_link_state_up(struct phyint *pi)
19670Sstevel@tonic-gate {
19680Sstevel@tonic-gate 	logerr("The link has come up on %s\n", pi->pi_name);
19690Sstevel@tonic-gate 
19700Sstevel@tonic-gate 	/*
19710Sstevel@tonic-gate 	 * We stopped any running timers on each instance when the link
19720Sstevel@tonic-gate 	 * went down, so restart them.
19730Sstevel@tonic-gate 	 */
19740Sstevel@tonic-gate 	if (pi->pi_v4)
19750Sstevel@tonic-gate 		restart_timer(pi->pi_v4);
19760Sstevel@tonic-gate 	if (pi->pi_v6)
19770Sstevel@tonic-gate 		restart_timer(pi->pi_v6);
19780Sstevel@tonic-gate 
19790Sstevel@tonic-gate 	phyint_check_for_repair(pi);
19800Sstevel@tonic-gate 
19810Sstevel@tonic-gate 	pi->pi_whenup[pi->pi_whendx++] = getcurrenttime();
19820Sstevel@tonic-gate 	if (pi->pi_whendx == LINK_UP_PERMIN)
19830Sstevel@tonic-gate 		pi->pi_whendx = 0;
19840Sstevel@tonic-gate }
19850Sstevel@tonic-gate 
19860Sstevel@tonic-gate /*
19870Sstevel@tonic-gate  * Process any changes in link state passed up from the interfaces.
19880Sstevel@tonic-gate  */
19890Sstevel@tonic-gate void
process_link_state_changes(void)19900Sstevel@tonic-gate process_link_state_changes(void)
19910Sstevel@tonic-gate {
19920Sstevel@tonic-gate 	struct phyint *pi;
19930Sstevel@tonic-gate 
19940Sstevel@tonic-gate 	/* Look for interfaces where the link state has just changed */
19950Sstevel@tonic-gate 
19960Sstevel@tonic-gate 	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
19970Sstevel@tonic-gate 		boolean_t old_link_state_up = LINK_UP(pi);
19980Sstevel@tonic-gate 
19990Sstevel@tonic-gate 		/*
20000Sstevel@tonic-gate 		 * Except when the "phyint" structure is created, this is
20010Sstevel@tonic-gate 		 * the only place the link state is updated.  This allows
20020Sstevel@tonic-gate 		 * this routine to detect changes in link state, rather
20030Sstevel@tonic-gate 		 * than just the current state.
20040Sstevel@tonic-gate 		 */
20050Sstevel@tonic-gate 		UPDATE_LINK_STATE(pi);
20060Sstevel@tonic-gate 
20070Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
20080Sstevel@tonic-gate 			/*
20090Sstevel@tonic-gate 			 * Has link just gone down?
20100Sstevel@tonic-gate 			 */
20110Sstevel@tonic-gate 			if (old_link_state_up)
20120Sstevel@tonic-gate 				process_link_state_down(pi);
20130Sstevel@tonic-gate 		} else {
20140Sstevel@tonic-gate 			/*
20150Sstevel@tonic-gate 			 * Has link just gone back up?
20160Sstevel@tonic-gate 			 */
20170Sstevel@tonic-gate 			if (!old_link_state_up)
20180Sstevel@tonic-gate 				process_link_state_up(pi);
20190Sstevel@tonic-gate 		}
20200Sstevel@tonic-gate 	}
20210Sstevel@tonic-gate }
20220Sstevel@tonic-gate 
20230Sstevel@tonic-gate void
reset_crtt_all(struct phyint * pi)20240Sstevel@tonic-gate reset_crtt_all(struct phyint *pi)
20250Sstevel@tonic-gate {
20260Sstevel@tonic-gate 	struct phyint_instance *pii;
20270Sstevel@tonic-gate 	struct target *tg;
20280Sstevel@tonic-gate 
20290Sstevel@tonic-gate 	pii = pi->pi_v4;
20300Sstevel@tonic-gate 	if (pii != NULL) {
20310Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
20320Sstevel@tonic-gate 			tg->tg_crtt = 0;
20330Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
20340Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
20350Sstevel@tonic-gate 		}
20360Sstevel@tonic-gate 	}
20370Sstevel@tonic-gate 
20380Sstevel@tonic-gate 	pii = pi->pi_v6;
20390Sstevel@tonic-gate 	if (pii != NULL) {
20400Sstevel@tonic-gate 		for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) {
20410Sstevel@tonic-gate 			tg->tg_crtt = 0;
20420Sstevel@tonic-gate 			tg->tg_rtt_sa = -1;
20430Sstevel@tonic-gate 			tg->tg_rtt_sd = 0;
20440Sstevel@tonic-gate 		}
20450Sstevel@tonic-gate 	}
20460Sstevel@tonic-gate }
20470Sstevel@tonic-gate 
20480Sstevel@tonic-gate /*
20490Sstevel@tonic-gate  * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive
20500Sstevel@tonic-gate  * probes on both instances IPv4 and IPv6.
20510Sstevel@tonic-gate  * If the interface has failed, return the time of the first probe failure
20520Sstevel@tonic-gate  * in "tff".
20530Sstevel@tonic-gate  */
20540Sstevel@tonic-gate static int
phyint_inst_probe_failure_state(struct phyint_instance * pii,uint_t * tff)20550Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff)
20560Sstevel@tonic-gate {
20570Sstevel@tonic-gate 	uint_t	pi_tff;
20580Sstevel@tonic-gate 	struct	target *cur_tg;
20590Sstevel@tonic-gate 	struct	probe_fail_count pfinfo;
20600Sstevel@tonic-gate 	struct	phyint_instance *pii_other;
20610Sstevel@tonic-gate 	int	pr_ndx;
20620Sstevel@tonic-gate 
20630Sstevel@tonic-gate 	/*
20640Sstevel@tonic-gate 	 * Get the number of consecutive failed probes on
20650Sstevel@tonic-gate 	 * this phyint across all targets. Also get the number
20660Sstevel@tonic-gate 	 * of consecutive failed probes on this target only
20670Sstevel@tonic-gate 	 */
20680Sstevel@tonic-gate 	pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
20690Sstevel@tonic-gate 	cur_tg = pii->pii_probes[pr_ndx].pr_target;
20700Sstevel@tonic-gate 	probe_fail_info(pii, cur_tg, &pfinfo);
20710Sstevel@tonic-gate 
20720Sstevel@tonic-gate 	/* Get the time of first failure, for later use */
20730Sstevel@tonic-gate 	pi_tff = pfinfo.pf_tff;
20740Sstevel@tonic-gate 
20750Sstevel@tonic-gate 	/*
20760Sstevel@tonic-gate 	 * If the current target has not responded to the
20770Sstevel@tonic-gate 	 * last NUM_PROBE_FAILS probes, and other targets are
20780Sstevel@tonic-gate 	 * responding delete this target. Dead gateway detection
20790Sstevel@tonic-gate 	 * will eventually remove this target (if router) from the
20800Sstevel@tonic-gate 	 * routing tables. If that does not occur, we may end
20810Sstevel@tonic-gate 	 * up adding this to our list again.
20820Sstevel@tonic-gate 	 */
20830Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS &&
20840Sstevel@tonic-gate 	    pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) {
20850Sstevel@tonic-gate 		if (pii->pii_targets_are_routers) {
20860Sstevel@tonic-gate 			if (cur_tg->tg_status == TG_ACTIVE)
20870Sstevel@tonic-gate 				pii->pii_ntargets--;
20880Sstevel@tonic-gate 			cur_tg->tg_status = TG_DEAD;
20890Sstevel@tonic-gate 			cur_tg->tg_crtt = 0;
20900Sstevel@tonic-gate 			cur_tg->tg_rtt_sa = -1;
20910Sstevel@tonic-gate 			cur_tg->tg_rtt_sd = 0;
20920Sstevel@tonic-gate 			if (pii->pii_target_next == cur_tg)
20930Sstevel@tonic-gate 				pii->pii_target_next = target_next(cur_tg);
20940Sstevel@tonic-gate 		} else {
20950Sstevel@tonic-gate 			target_delete(cur_tg);
20968485SPeter.Memishian@Sun.COM 			probe(pii, PROBE_MULTI, gethrtime());
20970Sstevel@tonic-gate 		}
20980Sstevel@tonic-gate 		return (PHYINT_OK);
20990Sstevel@tonic-gate 	}
21000Sstevel@tonic-gate 
21010Sstevel@tonic-gate 	/*
21020Sstevel@tonic-gate 	 * If the phyint has lost NUM_PROBE_FAILS or more
21030Sstevel@tonic-gate 	 * consecutive probes, on both IPv4 and IPv6 protocol
21040Sstevel@tonic-gate 	 * instances of the phyint, then trigger failure
21050Sstevel@tonic-gate 	 * detection, else return false
21060Sstevel@tonic-gate 	 */
21070Sstevel@tonic-gate 	if (pfinfo.pf_nfail < NUM_PROBE_FAILS)
21080Sstevel@tonic-gate 		return (PHYINT_OK);
21090Sstevel@tonic-gate 
21100Sstevel@tonic-gate 	pii_other = phyint_inst_other(pii);
21110Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii_other)) {
21120Sstevel@tonic-gate 		probe_fail_info(pii_other, NULL, &pfinfo);
21130Sstevel@tonic-gate 		if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) {
21140Sstevel@tonic-gate 			/*
21150Sstevel@tonic-gate 			 * We have NUM_PROBE_FAILS or more failures
21160Sstevel@tonic-gate 			 * on both IPv4 and IPv6. Get the earliest
21170Sstevel@tonic-gate 			 * time when failure was detected on this
21180Sstevel@tonic-gate 			 * phyint across IPv4 and IPv6.
21190Sstevel@tonic-gate 			 */
21200Sstevel@tonic-gate 			if (TIME_LT(pfinfo.pf_tff, pi_tff))
21210Sstevel@tonic-gate 				pi_tff = pfinfo.pf_tff;
21220Sstevel@tonic-gate 		} else {
21230Sstevel@tonic-gate 			/*
21240Sstevel@tonic-gate 			 * This instance has < NUM_PROBE_FAILS failure.
21250Sstevel@tonic-gate 			 * So return false
21260Sstevel@tonic-gate 			 */
21270Sstevel@tonic-gate 			return (PHYINT_OK);
21280Sstevel@tonic-gate 		}
21290Sstevel@tonic-gate 	}
21300Sstevel@tonic-gate 	*tff = pi_tff;
21310Sstevel@tonic-gate 	return (PHYINT_FAILURE);
21320Sstevel@tonic-gate }
21330Sstevel@tonic-gate 
21340Sstevel@tonic-gate /*
21350Sstevel@tonic-gate  * Check if the link has gone down on this phyint, or it has failed the
21360Sstevel@tonic-gate  * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6.
21370Sstevel@tonic-gate  * Also look at other phyints of this group, for group failures.
21380Sstevel@tonic-gate  */
21390Sstevel@tonic-gate int
failure_state(struct phyint_instance * pii)21400Sstevel@tonic-gate failure_state(struct phyint_instance *pii)
21410Sstevel@tonic-gate {
21420Sstevel@tonic-gate 	struct	probe_success_count psinfo;
21430Sstevel@tonic-gate 	uint_t	pi2_tls;		/* time last success */
21440Sstevel@tonic-gate 	uint_t	pi_tff;			/* time first fail */
21458485SPeter.Memishian@Sun.COM 	struct	phyint *pi2;
21460Sstevel@tonic-gate 	struct	phyint *pi;
21470Sstevel@tonic-gate 	struct	phyint_instance *pii2;
21480Sstevel@tonic-gate 	struct  phyint_group *pg;
21498485SPeter.Memishian@Sun.COM 	int	retval;
21500Sstevel@tonic-gate 
21518485SPeter.Memishian@Sun.COM 	if (debug & D_FAILREP)
21520Sstevel@tonic-gate 		logdebug("phyint_failed(%s)\n", pii->pii_name);
21530Sstevel@tonic-gate 
21540Sstevel@tonic-gate 	pi = pii->pii_phyint;
21550Sstevel@tonic-gate 	pg = pi->pi_group;
21560Sstevel@tonic-gate 
21570Sstevel@tonic-gate 	if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) ==
21584929Srk129064 	    PHYINT_OK)
21590Sstevel@tonic-gate 		return (PHYINT_OK);
21600Sstevel@tonic-gate 
21610Sstevel@tonic-gate 	/*
21628485SPeter.Memishian@Sun.COM 	 * At this point, the link is down, or the phyint is suspect, as it
21638485SPeter.Memishian@Sun.COM 	 * has lost NUM_PROBE_FAILS or more probes. If the phyint does not
21648485SPeter.Memishian@Sun.COM 	 * belong to any group, this is a PHYINT_FAILURE.  Otherwise, continue
21658485SPeter.Memishian@Sun.COM 	 * on to determine whether this should be considered a PHYINT_FAILURE
21668485SPeter.Memishian@Sun.COM 	 * or GROUP_FAILURE.
21670Sstevel@tonic-gate 	 */
21688485SPeter.Memishian@Sun.COM 	if (pg == phyint_anongroup)
21690Sstevel@tonic-gate 		return (PHYINT_FAILURE);
21700Sstevel@tonic-gate 
21710Sstevel@tonic-gate 	/*
21720Sstevel@tonic-gate 	 * Need to compare against other phyints of the same group
21730Sstevel@tonic-gate 	 * to exclude group failures. If the failure was detected via
21740Sstevel@tonic-gate 	 * probing, then if the time of last success (tls) of any
21750Sstevel@tonic-gate 	 * phyint is more recent than the time of first fail (tff) of the
21760Sstevel@tonic-gate 	 * phyint in question, and the link is up on the phyint,
21770Sstevel@tonic-gate 	 * then it is a phyint failure. Otherwise it is a group failure.
21780Sstevel@tonic-gate 	 * If failure was detected via a link down notification sent from
21790Sstevel@tonic-gate 	 * the driver to IP, we see if any phyints in the group are still
21800Sstevel@tonic-gate 	 * running and haven't received a link down notification.  We
21810Sstevel@tonic-gate 	 * will usually be processing the link down notification shortly
21820Sstevel@tonic-gate 	 * after it was received, so there is no point looking at the tls
21830Sstevel@tonic-gate 	 * of other phyints.
21840Sstevel@tonic-gate 	 */
21858485SPeter.Memishian@Sun.COM 	retval = GROUP_FAILURE;
21860Sstevel@tonic-gate 	for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) {
21870Sstevel@tonic-gate 		/* Exclude ourself from comparison */
21880Sstevel@tonic-gate 		if (pi2 == pi)
21890Sstevel@tonic-gate 			continue;
21900Sstevel@tonic-gate 
21910Sstevel@tonic-gate 		if (LINK_DOWN(pi)) {
21920Sstevel@tonic-gate 			/*
21938485SPeter.Memishian@Sun.COM 			 * We use FLAGS_TO_LINK_STATE() to test the flags
21948485SPeter.Memishian@Sun.COM 			 * directly, rather then LINK_UP() or LINK_DOWN(), as
21958485SPeter.Memishian@Sun.COM 			 * we may not have got round to processing the link
21968485SPeter.Memishian@Sun.COM 			 * state for the other phyints in the group yet.
21970Sstevel@tonic-gate 			 *
21988485SPeter.Memishian@Sun.COM 			 * The check for PI_RUNNING and group failure handles
21998485SPeter.Memishian@Sun.COM 			 * the case when the group begins to recover.
22008485SPeter.Memishian@Sun.COM 			 * PI_RUNNING will be set, and group failure cleared
22018485SPeter.Memishian@Sun.COM 			 * only after receipt of NUM_PROBE_REPAIRS, by which
22028485SPeter.Memishian@Sun.COM 			 * time the other phyints should have received at
22038485SPeter.Memishian@Sun.COM 			 * least 1 packet, and so will not have NUM_PROBE_FAILS.
22040Sstevel@tonic-gate 			 */
22050Sstevel@tonic-gate 			if ((pi2->pi_state == PI_RUNNING) &&
22068485SPeter.Memishian@Sun.COM 			    !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) {
22078485SPeter.Memishian@Sun.COM 				retval = PHYINT_FAILURE;
22088485SPeter.Memishian@Sun.COM 				break;
22098485SPeter.Memishian@Sun.COM 			}
22108485SPeter.Memishian@Sun.COM 			continue;
22118485SPeter.Memishian@Sun.COM 		}
22128485SPeter.Memishian@Sun.COM 
22138485SPeter.Memishian@Sun.COM 		if (LINK_DOWN(pi2))
22148485SPeter.Memishian@Sun.COM 			continue;
22158485SPeter.Memishian@Sun.COM 
22168485SPeter.Memishian@Sun.COM 		/*
22178485SPeter.Memishian@Sun.COM 		 * If there's no probe-based failure detection on this
22188485SPeter.Memishian@Sun.COM 		 * interface, and its link is still up, then it's still
22198485SPeter.Memishian@Sun.COM 		 * working and thus the group has not failed.
22208485SPeter.Memishian@Sun.COM 		 */
22218485SPeter.Memishian@Sun.COM 		if (!PROBE_ENABLED(pi2->pi_v4) && !PROBE_ENABLED(pi2->pi_v6)) {
22228485SPeter.Memishian@Sun.COM 			retval = PHYINT_FAILURE;
22238485SPeter.Memishian@Sun.COM 			break;
22248485SPeter.Memishian@Sun.COM 		}
22258485SPeter.Memishian@Sun.COM 
22268485SPeter.Memishian@Sun.COM 		/*
22278485SPeter.Memishian@Sun.COM 		 * Need to compare against both IPv4 and IPv6 instances.
22288485SPeter.Memishian@Sun.COM 		 */
22298485SPeter.Memishian@Sun.COM 		pii2 = pi2->pi_v4;
22308485SPeter.Memishian@Sun.COM 		if (pii2 != NULL) {
22318485SPeter.Memishian@Sun.COM 			probe_success_info(pii2, NULL, &psinfo);
22328485SPeter.Memishian@Sun.COM 			if (psinfo.ps_tls_valid) {
22338485SPeter.Memishian@Sun.COM 				pi2_tls = psinfo.ps_tls;
22348485SPeter.Memishian@Sun.COM 				/*
22358485SPeter.Memishian@Sun.COM 				 * See comment above regarding check
22368485SPeter.Memishian@Sun.COM 				 * for PI_RUNNING and group failure.
22378485SPeter.Memishian@Sun.COM 				 */
22388485SPeter.Memishian@Sun.COM 				if (TIME_GT(pi2_tls, pi_tff) &&
22398485SPeter.Memishian@Sun.COM 				    (pi2->pi_state == PI_RUNNING) &&
22408485SPeter.Memishian@Sun.COM 				    !GROUP_FAILED(pg) &&
22418485SPeter.Memishian@Sun.COM 				    FLAGS_TO_LINK_STATE(pi2)) {
22428485SPeter.Memishian@Sun.COM 					retval = PHYINT_FAILURE;
22438485SPeter.Memishian@Sun.COM 					break;
22440Sstevel@tonic-gate 				}
22450Sstevel@tonic-gate 			}
22468485SPeter.Memishian@Sun.COM 		}
22470Sstevel@tonic-gate 
22488485SPeter.Memishian@Sun.COM 		pii2 = pi2->pi_v6;
22498485SPeter.Memishian@Sun.COM 		if (pii2 != NULL) {
22508485SPeter.Memishian@Sun.COM 			probe_success_info(pii2, NULL, &psinfo);
22518485SPeter.Memishian@Sun.COM 			if (psinfo.ps_tls_valid) {
22528485SPeter.Memishian@Sun.COM 				pi2_tls = psinfo.ps_tls;
22538485SPeter.Memishian@Sun.COM 				/*
22548485SPeter.Memishian@Sun.COM 				 * See comment above regarding check
22558485SPeter.Memishian@Sun.COM 				 * for PI_RUNNING and group failure.
22568485SPeter.Memishian@Sun.COM 				 */
22578485SPeter.Memishian@Sun.COM 				if (TIME_GT(pi2_tls, pi_tff) &&
22588485SPeter.Memishian@Sun.COM 				    (pi2->pi_state == PI_RUNNING) &&
22598485SPeter.Memishian@Sun.COM 				    !GROUP_FAILED(pg) &&
22608485SPeter.Memishian@Sun.COM 				    FLAGS_TO_LINK_STATE(pi2)) {
22618485SPeter.Memishian@Sun.COM 					retval = PHYINT_FAILURE;
22628485SPeter.Memishian@Sun.COM 					break;
22630Sstevel@tonic-gate 				}
22640Sstevel@tonic-gate 			}
22650Sstevel@tonic-gate 		}
22660Sstevel@tonic-gate 	}
22670Sstevel@tonic-gate 
22680Sstevel@tonic-gate 	/*
22698485SPeter.Memishian@Sun.COM 	 * Update the group state to account for the changes.
22700Sstevel@tonic-gate 	 */
22718485SPeter.Memishian@Sun.COM 	phyint_group_refresh_state(pg);
22728485SPeter.Memishian@Sun.COM 	return (retval);
22730Sstevel@tonic-gate }
22740Sstevel@tonic-gate 
22750Sstevel@tonic-gate /*
22760Sstevel@tonic-gate  * Return the information associated with consecutive probe successes
22770Sstevel@tonic-gate  * starting with the most recent probe. At most the last 2 probes can be
22780Sstevel@tonic-gate  * in the unacknowledged state. All previous probes have either failed
22790Sstevel@tonic-gate  * or succeeded.
22800Sstevel@tonic-gate  */
22810Sstevel@tonic-gate static void
probe_success_info(struct phyint_instance * pii,struct target * cur_tg,struct probe_success_count * psinfo)22820Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg,
22830Sstevel@tonic-gate     struct probe_success_count *psinfo)
22840Sstevel@tonic-gate {
22850Sstevel@tonic-gate 	uint_t	i;
22860Sstevel@tonic-gate 	struct probe_stats *pr_statp;
22870Sstevel@tonic-gate 	uint_t most_recent;
22880Sstevel@tonic-gate 	uint_t second_most_recent;
22890Sstevel@tonic-gate 	boolean_t pi_found_failure = _B_FALSE;
22900Sstevel@tonic-gate 	boolean_t tg_found_failure = _B_FALSE;
22910Sstevel@tonic-gate 	uint_t now;
22920Sstevel@tonic-gate 	uint_t timeout;
22930Sstevel@tonic-gate 	struct target *tg;
22940Sstevel@tonic-gate 
22958485SPeter.Memishian@Sun.COM 	if (debug & D_FAILREP)
22960Sstevel@tonic-gate 		logdebug("probe_success_info(%s)\n", pii->pii_name);
22970Sstevel@tonic-gate 
22980Sstevel@tonic-gate 	bzero(psinfo, sizeof (*psinfo));
22990Sstevel@tonic-gate 	now = getcurrenttime();
23000Sstevel@tonic-gate 
23010Sstevel@tonic-gate 	/*
23020Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
23030Sstevel@tonic-gate 	 * of consecutive probe successes. Latch the number of successes
23040Sstevel@tonic-gate 	 * on hitting a failure.
23050Sstevel@tonic-gate 	 */
23060Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
23070Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
23080Sstevel@tonic-gate 
23090Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
23100Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
23110Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
23120Sstevel@tonic-gate 
23130Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
23140Sstevel@tonic-gate 		case PR_UNACKED:
23150Sstevel@tonic-gate 			/*
23160Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
23170Sstevel@tonic-gate 			 */
23180Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
23190Sstevel@tonic-gate 
23200Sstevel@tonic-gate 			tg = pr_statp->pr_target;
23210Sstevel@tonic-gate 			assert(tg != NULL);
23220Sstevel@tonic-gate 			/*
23230Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
23240Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
23250Sstevel@tonic-gate 			 * not available use the value of the group's probe
23260Sstevel@tonic-gate 			 * interval which is a worst case estimate.
23270Sstevel@tonic-gate 			 */
23288485SPeter.Memishian@Sun.COM 			timeout = ns2ms(pr_statp->pr_hrtime_start);
23290Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
23308485SPeter.Memishian@Sun.COM 				timeout += tg->tg_crtt;
23310Sstevel@tonic-gate 			} else {
23328485SPeter.Memishian@Sun.COM 				timeout +=
23330Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
23340Sstevel@tonic-gate 			}
23350Sstevel@tonic-gate 
23360Sstevel@tonic-gate 			if (TIME_LT(timeout, now)) {
23370Sstevel@tonic-gate 				/*
23380Sstevel@tonic-gate 				 * We hit a failure. Latch the total number of
23390Sstevel@tonic-gate 				 * recent consecutive successes.
23400Sstevel@tonic-gate 				 */
23410Sstevel@tonic-gate 				pr_statp->pr_time_lost = timeout;
23428485SPeter.Memishian@Sun.COM 				probe_chstate(pr_statp, pii, PR_LOST);
23430Sstevel@tonic-gate 				pi_found_failure = _B_TRUE;
23440Sstevel@tonic-gate 				if (cur_tg != NULL && tg == cur_tg) {
23450Sstevel@tonic-gate 					/*
23460Sstevel@tonic-gate 					 * We hit a failure for the desired
23470Sstevel@tonic-gate 					 * target. Latch the number of recent
23480Sstevel@tonic-gate 					 * consecutive successes for this target
23490Sstevel@tonic-gate 					 */
23500Sstevel@tonic-gate 					tg_found_failure = _B_TRUE;
23510Sstevel@tonic-gate 				}
23520Sstevel@tonic-gate 			}
23530Sstevel@tonic-gate 			break;
23540Sstevel@tonic-gate 
23550Sstevel@tonic-gate 		case PR_ACKED:
23560Sstevel@tonic-gate 			/*
23570Sstevel@tonic-gate 			 * Bump up the count of probe successes, if we
23580Sstevel@tonic-gate 			 * have not seen any failure so far.
23590Sstevel@tonic-gate 			 */
23600Sstevel@tonic-gate 			if (!pi_found_failure)
23610Sstevel@tonic-gate 				psinfo->ps_nsucc++;
23620Sstevel@tonic-gate 
23630Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
23640Sstevel@tonic-gate 			    !tg_found_failure) {
23650Sstevel@tonic-gate 				psinfo->ps_nsucc_tg++;
23660Sstevel@tonic-gate 			}
23670Sstevel@tonic-gate 
23680Sstevel@tonic-gate 			/*
23690Sstevel@tonic-gate 			 * Record the time of last success, if this is
23700Sstevel@tonic-gate 			 * the most recent probe success.
23710Sstevel@tonic-gate 			 */
23720Sstevel@tonic-gate 			if (!psinfo->ps_tls_valid) {
23738485SPeter.Memishian@Sun.COM 				psinfo->ps_tls =
23748485SPeter.Memishian@Sun.COM 				    ns2ms(pr_statp->pr_hrtime_ackproc);
23750Sstevel@tonic-gate 				psinfo->ps_tls_valid = _B_TRUE;
23760Sstevel@tonic-gate 			}
23770Sstevel@tonic-gate 			break;
23780Sstevel@tonic-gate 
23790Sstevel@tonic-gate 		case PR_LOST:
23800Sstevel@tonic-gate 			/*
23810Sstevel@tonic-gate 			 * We hit a failure. Latch the total number of
23820Sstevel@tonic-gate 			 * recent consecutive successes.
23830Sstevel@tonic-gate 			 */
23840Sstevel@tonic-gate 			pi_found_failure = _B_TRUE;
23850Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
23860Sstevel@tonic-gate 				/*
23870Sstevel@tonic-gate 				 * We hit a failure for the desired target.
23880Sstevel@tonic-gate 				 * Latch the number of recent consecutive
23890Sstevel@tonic-gate 				 * successes for this target
23900Sstevel@tonic-gate 				 */
23910Sstevel@tonic-gate 				tg_found_failure = _B_TRUE;
23920Sstevel@tonic-gate 			}
23930Sstevel@tonic-gate 			break;
23940Sstevel@tonic-gate 
23950Sstevel@tonic-gate 		default:
23960Sstevel@tonic-gate 			return;
23970Sstevel@tonic-gate 
23980Sstevel@tonic-gate 		}
23990Sstevel@tonic-gate 	}
24000Sstevel@tonic-gate }
24010Sstevel@tonic-gate 
24020Sstevel@tonic-gate /*
24030Sstevel@tonic-gate  * Return the information associated with consecutive probe failures
24040Sstevel@tonic-gate  * starting with the most recent probe. Only the last 2 probes can be in the
24050Sstevel@tonic-gate  * unacknowledged state. All previous probes have either failed or succeeded.
24060Sstevel@tonic-gate  */
24070Sstevel@tonic-gate static void
probe_fail_info(struct phyint_instance * pii,struct target * cur_tg,struct probe_fail_count * pfinfo)24080Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg,
24090Sstevel@tonic-gate     struct probe_fail_count *pfinfo)
24100Sstevel@tonic-gate {
24110Sstevel@tonic-gate 	int	i;
24120Sstevel@tonic-gate 	struct probe_stats *pr_statp;
24130Sstevel@tonic-gate 	boolean_t	tg_found_success = _B_FALSE;
24140Sstevel@tonic-gate 	boolean_t	pi_found_success = _B_FALSE;
24150Sstevel@tonic-gate 	int	most_recent;
24160Sstevel@tonic-gate 	int	second_most_recent;
24170Sstevel@tonic-gate 	uint_t	now;
24180Sstevel@tonic-gate 	uint_t	timeout;
24190Sstevel@tonic-gate 	struct	target *tg;
24200Sstevel@tonic-gate 
24218485SPeter.Memishian@Sun.COM 	if (debug & D_FAILREP)
24220Sstevel@tonic-gate 		logdebug("probe_fail_info(%s)\n", pii->pii_name);
24230Sstevel@tonic-gate 
24240Sstevel@tonic-gate 	bzero(pfinfo, sizeof (*pfinfo));
24250Sstevel@tonic-gate 	now = getcurrenttime();
24260Sstevel@tonic-gate 
24270Sstevel@tonic-gate 	/*
24280Sstevel@tonic-gate 	 * Start with the most recent probe, and count the number
24290Sstevel@tonic-gate 	 * of consecutive probe failures. Latch the number of failures
24300Sstevel@tonic-gate 	 * on hitting a probe success.
24310Sstevel@tonic-gate 	 */
24320Sstevel@tonic-gate 	most_recent = PROBE_INDEX_PREV(pii->pii_probe_next);
24330Sstevel@tonic-gate 	second_most_recent = PROBE_INDEX_PREV(most_recent);
24340Sstevel@tonic-gate 
24350Sstevel@tonic-gate 	for (i = most_recent; i != pii->pii_probe_next;
24360Sstevel@tonic-gate 	    i = PROBE_INDEX_PREV(i)) {
24370Sstevel@tonic-gate 		pr_statp = &pii->pii_probes[i];
24380Sstevel@tonic-gate 
24390Sstevel@tonic-gate 		assert(PR_STATUS_VALID(pr_statp->pr_status));
24400Sstevel@tonic-gate 
24410Sstevel@tonic-gate 		switch (pr_statp->pr_status) {
24420Sstevel@tonic-gate 		case PR_UNACKED:
24430Sstevel@tonic-gate 			/*
24440Sstevel@tonic-gate 			 * Only the most recent 2 probes can be unacknowledged
24450Sstevel@tonic-gate 			 */
24460Sstevel@tonic-gate 			assert(i == most_recent || i == second_most_recent);
24470Sstevel@tonic-gate 
24480Sstevel@tonic-gate 			tg = pr_statp->pr_target;
24490Sstevel@tonic-gate 			/*
24500Sstevel@tonic-gate 			 * Target is guaranteed to exist in the unack. state
24510Sstevel@tonic-gate 			 */
24520Sstevel@tonic-gate 			assert(tg != NULL);
24530Sstevel@tonic-gate 			/*
24540Sstevel@tonic-gate 			 * The crtt could be zero for some reason,
24550Sstevel@tonic-gate 			 * Eg. the phyint could be failed. If the crtt is
24560Sstevel@tonic-gate 			 * not available use the group's probe interval,
24570Sstevel@tonic-gate 			 * which is a worst case estimate.
24580Sstevel@tonic-gate 			 */
24598485SPeter.Memishian@Sun.COM 			timeout = ns2ms(pr_statp->pr_hrtime_start);
24600Sstevel@tonic-gate 			if (tg->tg_crtt != 0) {
24618485SPeter.Memishian@Sun.COM 				timeout += tg->tg_crtt;
24620Sstevel@tonic-gate 			} else {
24638485SPeter.Memishian@Sun.COM 				timeout +=
24640Sstevel@tonic-gate 				    pii->pii_phyint->pi_group->pg_probeint;
24650Sstevel@tonic-gate 			}
24660Sstevel@tonic-gate 
24670Sstevel@tonic-gate 			if (TIME_GT(timeout, now))
24680Sstevel@tonic-gate 				break;
24690Sstevel@tonic-gate 
24700Sstevel@tonic-gate 			pr_statp->pr_time_lost = timeout;
24718485SPeter.Memishian@Sun.COM 			probe_chstate(pr_statp, pii, PR_LOST);
24720Sstevel@tonic-gate 			/* FALLTHRU */
24730Sstevel@tonic-gate 
24740Sstevel@tonic-gate 		case PR_LOST:
24750Sstevel@tonic-gate 			if (!pi_found_success) {
24760Sstevel@tonic-gate 				pfinfo->pf_nfail++;
24770Sstevel@tonic-gate 				pfinfo->pf_tff = pr_statp->pr_time_lost;
24780Sstevel@tonic-gate 			}
24790Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg &&
24800Sstevel@tonic-gate 			    !tg_found_success)  {
24810Sstevel@tonic-gate 				pfinfo->pf_nfail_tg++;
24820Sstevel@tonic-gate 			}
24830Sstevel@tonic-gate 			break;
24840Sstevel@tonic-gate 
24850Sstevel@tonic-gate 		default:
24860Sstevel@tonic-gate 			/*
24870Sstevel@tonic-gate 			 * We hit a success or unused slot. Latch the
24880Sstevel@tonic-gate 			 * total number of recent consecutive failures.
24890Sstevel@tonic-gate 			 */
24900Sstevel@tonic-gate 			pi_found_success = _B_TRUE;
24910Sstevel@tonic-gate 			if (cur_tg != NULL && pr_statp->pr_target == cur_tg) {
24920Sstevel@tonic-gate 				/*
24930Sstevel@tonic-gate 				 * We hit a success for the desired target.
24940Sstevel@tonic-gate 				 * Latch the number of recent consecutive
24950Sstevel@tonic-gate 				 * failures for this target
24960Sstevel@tonic-gate 				 */
24970Sstevel@tonic-gate 				tg_found_success = _B_TRUE;
24980Sstevel@tonic-gate 			}
24990Sstevel@tonic-gate 		}
25000Sstevel@tonic-gate 	}
25010Sstevel@tonic-gate }
25020Sstevel@tonic-gate 
25030Sstevel@tonic-gate /*
25048485SPeter.Memishian@Sun.COM  * Change the state of probe `pr' on phyint_instance `pii' to state `state'.
25058485SPeter.Memishian@Sun.COM  */
25068485SPeter.Memishian@Sun.COM void
probe_chstate(struct probe_stats * pr,struct phyint_instance * pii,int state)25078485SPeter.Memishian@Sun.COM probe_chstate(struct probe_stats *pr, struct phyint_instance *pii, int state)
25088485SPeter.Memishian@Sun.COM {
25098485SPeter.Memishian@Sun.COM 	if (pr->pr_status == state)
25108485SPeter.Memishian@Sun.COM 		return;
25118485SPeter.Memishian@Sun.COM 
25128485SPeter.Memishian@Sun.COM 	pr->pr_status = state;
25138485SPeter.Memishian@Sun.COM 	(void) probe_state_event(pr, pii);
25148485SPeter.Memishian@Sun.COM }
25158485SPeter.Memishian@Sun.COM 
25168485SPeter.Memishian@Sun.COM /*
25170Sstevel@tonic-gate  * Check if the phyint has been repaired.  If no test address has been
25180Sstevel@tonic-gate  * configured, then consider the interface repaired if the link is up (unless
25190Sstevel@tonic-gate  * the link is flapping; see below).  Otherwise, look for proof of probes
25200Sstevel@tonic-gate  * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on
25210Sstevel@tonic-gate  * either IPv4 or IPv6 instance, the phyint can be considered repaired.
25220Sstevel@tonic-gate  */
25230Sstevel@tonic-gate static boolean_t
phyint_repaired(struct phyint * pi)25240Sstevel@tonic-gate phyint_repaired(struct phyint *pi)
25250Sstevel@tonic-gate {
25260Sstevel@tonic-gate 	struct	probe_success_count psinfo;
25270Sstevel@tonic-gate 	struct	phyint_instance *pii;
25280Sstevel@tonic-gate 	struct	target *cur_tg;
25290Sstevel@tonic-gate 	int	pr_ndx;
25300Sstevel@tonic-gate 	uint_t	cur_time;
25310Sstevel@tonic-gate 
25328485SPeter.Memishian@Sun.COM 	if (debug & D_FAILREP)
25330Sstevel@tonic-gate 		logdebug("phyint_repaired(%s)\n", pi->pi_name);
25340Sstevel@tonic-gate 
25350Sstevel@tonic-gate 	if (LINK_DOWN(pi))
25360Sstevel@tonic-gate 		return (_B_FALSE);
25370Sstevel@tonic-gate 
25380Sstevel@tonic-gate 	/*
25390Sstevel@tonic-gate 	 * If we don't have any test addresses and the link is up, then
25400Sstevel@tonic-gate 	 * consider the interface repaired, unless we've received more than
25410Sstevel@tonic-gate 	 * LINK_UP_PERMIN link up notifications in the last minute, in
25420Sstevel@tonic-gate 	 * which case we keep the link down until we drop back below
25430Sstevel@tonic-gate 	 * the threshold.
25440Sstevel@tonic-gate 	 */
25450Sstevel@tonic-gate 	if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
25460Sstevel@tonic-gate 		cur_time = getcurrenttime();
25470Sstevel@tonic-gate 		if ((pi->pi_whenup[pi->pi_whendx] == 0 ||
25480Sstevel@tonic-gate 		    (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) {
25490Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 0;
25500Sstevel@tonic-gate 			return (_B_TRUE);
25510Sstevel@tonic-gate 		}
25520Sstevel@tonic-gate 		if (!pi->pi_lfmsg_printed) {
25530Sstevel@tonic-gate 			logerr("The link has come up on %s more than %d times "
25548485SPeter.Memishian@Sun.COM 			    "in the last minute; disabling repair until it "
25550Sstevel@tonic-gate 			    "stabilizes\n", pi->pi_name, LINK_UP_PERMIN);
25560Sstevel@tonic-gate 			pi->pi_lfmsg_printed = 1;
25570Sstevel@tonic-gate 		}
25580Sstevel@tonic-gate 
25590Sstevel@tonic-gate 		return (_B_FALSE);
25600Sstevel@tonic-gate 	}
25610Sstevel@tonic-gate 
25620Sstevel@tonic-gate 	pii = pi->pi_v4;
25630Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
25640Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
25650Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
25660Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
25670Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
25680Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
25690Sstevel@tonic-gate 			return (_B_TRUE);
25700Sstevel@tonic-gate 	}
25710Sstevel@tonic-gate 
25720Sstevel@tonic-gate 	pii = pi->pi_v6;
25730Sstevel@tonic-gate 	if (PROBE_CAPABLE(pii)) {
25740Sstevel@tonic-gate 		pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
25750Sstevel@tonic-gate 		cur_tg = pii->pii_probes[pr_ndx].pr_target;
25760Sstevel@tonic-gate 		probe_success_info(pii, cur_tg, &psinfo);
25770Sstevel@tonic-gate 		if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS ||
25780Sstevel@tonic-gate 		    psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS)
25790Sstevel@tonic-gate 			return (_B_TRUE);
25800Sstevel@tonic-gate 	}
25810Sstevel@tonic-gate 
25820Sstevel@tonic-gate 	return (_B_FALSE);
25830Sstevel@tonic-gate }
25840Sstevel@tonic-gate 
25850Sstevel@tonic-gate /*
25860Sstevel@tonic-gate  * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call.
25870Sstevel@tonic-gate  */
25880Sstevel@tonic-gate boolean_t
change_pif_flags(struct phyint * pi,uint64_t set,uint64_t clear)25898485SPeter.Memishian@Sun.COM change_pif_flags(struct phyint *pi, uint64_t set, uint64_t clear)
25900Sstevel@tonic-gate {
25910Sstevel@tonic-gate 	int ifsock;
25920Sstevel@tonic-gate 	struct lifreq lifr;
25934929Srk129064 	uint64_t old_flags;
25940Sstevel@tonic-gate 
25958485SPeter.Memishian@Sun.COM 	if (debug & D_FAILREP) {
25968485SPeter.Memishian@Sun.COM 		logdebug("change_pif_flags(%s): set %llx clear %llx\n",
25978485SPeter.Memishian@Sun.COM 		    pi->pi_name, set, clear);
25980Sstevel@tonic-gate 	}
25990Sstevel@tonic-gate 
26008485SPeter.Memishian@Sun.COM 	if (pi->pi_v4 != NULL)
26010Sstevel@tonic-gate 		ifsock = ifsock_v4;
26028485SPeter.Memishian@Sun.COM 	else
26030Sstevel@tonic-gate 		ifsock = ifsock_v6;
26040Sstevel@tonic-gate 
26050Sstevel@tonic-gate 	/*
26060Sstevel@tonic-gate 	 * Get the current flags from the kernel, and set/clear the
26070Sstevel@tonic-gate 	 * desired phyint flags. Since we set only phyint flags, we can
26080Sstevel@tonic-gate 	 * do it on either IPv4 or IPv6 instance.
26090Sstevel@tonic-gate 	 */
26108485SPeter.Memishian@Sun.COM 	(void) strlcpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name));
26118485SPeter.Memishian@Sun.COM 
26120Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) {
26130Sstevel@tonic-gate 		if (errno != ENXIO)
26148485SPeter.Memishian@Sun.COM 			logperror("change_pif_flags: ioctl (get flags)");
26150Sstevel@tonic-gate 		return (_B_FALSE);
26160Sstevel@tonic-gate 	}
26174929Srk129064 
26184929Srk129064 	old_flags = lifr.lifr_flags;
26198485SPeter.Memishian@Sun.COM 	lifr.lifr_flags |= set;
26208485SPeter.Memishian@Sun.COM 	lifr.lifr_flags &= ~clear;
26214929Srk129064 
26224929Srk129064 	if (old_flags == lifr.lifr_flags) {
26234929Srk129064 		/* No change in the flags. No need to send ioctl */
26244929Srk129064 		return (_B_TRUE);
26254929Srk129064 	}
26264929Srk129064 
26270Sstevel@tonic-gate 	if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) {
26280Sstevel@tonic-gate 		if (errno != ENXIO)
26298485SPeter.Memishian@Sun.COM 			logperror("change_pif_flags: ioctl (set flags)");
26300Sstevel@tonic-gate 		return (_B_FALSE);
26310Sstevel@tonic-gate 	}
26320Sstevel@tonic-gate 
26330Sstevel@tonic-gate 	/*
26340Sstevel@tonic-gate 	 * Keep pi_flags in synch. with actual flags. Assumes flags are
26350Sstevel@tonic-gate 	 * phyint flags.
26360Sstevel@tonic-gate 	 */
26378485SPeter.Memishian@Sun.COM 	pi->pi_flags |= set;
26388485SPeter.Memishian@Sun.COM 	pi->pi_flags &= ~clear;
26390Sstevel@tonic-gate 
26408485SPeter.Memishian@Sun.COM 	if (pi->pi_v4 != NULL)
26410Sstevel@tonic-gate 		pi->pi_v4->pii_flags = pi->pi_flags;
26420Sstevel@tonic-gate 
26438485SPeter.Memishian@Sun.COM 	if (pi->pi_v6 != NULL)
26440Sstevel@tonic-gate 		pi->pi_v6->pii_flags = pi->pi_flags;
26450Sstevel@tonic-gate 
26460Sstevel@tonic-gate 	return (_B_TRUE);
26470Sstevel@tonic-gate }
26480Sstevel@tonic-gate 
/*
 * ICMP checksum computation for IPv4.
 *
 * Sums the `len' bytes at `addr' as 16-bit words in a 32-bit accumulator,
 * folds the carries out of the upper 16 bits back into the lower 16, and
 * returns the one's complement of the result truncated to 16 bits.  A
 * trailing odd byte is added as a 16-bit word whose first byte in memory
 * is that byte and whose other byte is zero.
 */
static int
in_cksum(ushort_t *addr, int len)
{
	ushort_t *wp = addr;
	ushort_t last = 0;
	ushort_t cksum;
	int remaining;
	int acc = 0;

	/* Add up all of the complete 16-bit words. */
	for (remaining = len; remaining > 1; remaining -= 2)
		acc += *wp++;

	/* Pick up the leftover byte, if the length was odd. */
	if (remaining == 1) {
		*(uchar_t *)&last = *(uchar_t *)wp;
		acc += last;
	}

	/* Fold the high 16 bits into the low 16, then fold the carry. */
	acc = (acc & 0xffff) + (acc >> 16);
	acc += acc >> 16;

	/* Complementing into a ushort_t truncates to 16 bits. */
	cksum = ~acc;
	return (cksum);
}
26860Sstevel@tonic-gate 
26870Sstevel@tonic-gate static void
reset_snxt_basetimes(void)26880Sstevel@tonic-gate reset_snxt_basetimes(void)
26890Sstevel@tonic-gate {
26900Sstevel@tonic-gate 	struct phyint_instance *pii;
26910Sstevel@tonic-gate 
26920Sstevel@tonic-gate 	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
26930Sstevel@tonic-gate 		pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime;
26940Sstevel@tonic-gate 	}
26950Sstevel@tonic-gate }
26960Sstevel@tonic-gate 
26970Sstevel@tonic-gate /*
26980Sstevel@tonic-gate  * Is the address one of our own addresses? Unfortunately,
26990Sstevel@tonic-gate  * we cannot check our phyint tables to determine if the address
27000Sstevel@tonic-gate  * is our own. This is because, we don't track interfaces that
27010Sstevel@tonic-gate  * are not part of any group. We have to either use a 'bind' or
27020Sstevel@tonic-gate  * get the complete list of all interfaces using SIOCGLIFCONF,
27032250Srk129064  * to do this check. We could also use SIOCTMYADDR.
27042250Srk129064  * Bind fails for the local zone address, so we might include local zone
27052250Srk129064  * address as target address. If local zone address is a target address
27062250Srk129064  * and it is up, it is not possible to detect the interface failure.
27072250Srk129064  * SIOCTMYADDR also doesn't consider local zone address as own address.
27082250Srk129064  * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
27098485SPeter.Memishian@Sun.COM  * are stored in `localaddrs'
27100Sstevel@tonic-gate  */
27112250Srk129064 boolean_t
own_address(struct in6_addr addr)27122250Srk129064 own_address(struct in6_addr addr)
27132250Srk129064 {
27148485SPeter.Memishian@Sun.COM 	addrlist_t *addrp;
27158485SPeter.Memishian@Sun.COM 	struct sockaddr_storage ss;
27168485SPeter.Memishian@Sun.COM 	int af = IN6_IS_ADDR_V4MAPPED(&addr) ? AF_INET : AF_INET6;
27172250Srk129064 
27188485SPeter.Memishian@Sun.COM 	addr2storage(af, &addr, &ss);
27198485SPeter.Memishian@Sun.COM 	for (addrp = localaddrs; addrp != NULL; addrp = addrp->al_next) {
27208485SPeter.Memishian@Sun.COM 		if (sockaddrcmp(&ss, &addrp->al_addr))
27212250Srk129064 			return (_B_TRUE);
27220Sstevel@tonic-gate 	}
27232250Srk129064 	return (_B_FALSE);
27240Sstevel@tonic-gate }
27258485SPeter.Memishian@Sun.COM 
/*
 * Convert `ns' nanoseconds to whole milliseconds (integer division).
 * The quotient is returned as an int, so it is narrowed from 64 bits.
 */
static int
ns2ms(int64_t ns)
{
	int64_t ns_per_ms = NANOSEC / MILLISEC;

	return (ns / ns_per_ms);
}
27318485SPeter.Memishian@Sun.COM 
/*
 * Convert the timeval `tvp' to a count of nanoseconds.
 *
 * Both terms are widened to int64_t before multiplying.  Without the
 * casts the products are evaluated in the operands' own types, which
 * overflows for any realistic tv_sec when tv_sec and NANOSEC are both
 * narrower than 64 bits (e.g. a 32-bit time_t with a plain int NANOSEC).
 */
static int64_t
tv2ns(struct timeval *tvp)
{
	return ((int64_t)tvp->tv_sec * NANOSEC +
	    (int64_t)tvp->tv_usec * 1000);
}
2737