10Sstevel@tonic-gate /* 22250Srk129064 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 30Sstevel@tonic-gate * Use is subject to license terms. 40Sstevel@tonic-gate */ 50Sstevel@tonic-gate 60Sstevel@tonic-gate /* 70Sstevel@tonic-gate * Copyright (c) 1987 Regents of the University of California. 80Sstevel@tonic-gate * All rights reserved. 90Sstevel@tonic-gate * 100Sstevel@tonic-gate * Redistribution and use in source and binary forms are permitted 110Sstevel@tonic-gate * provided that the above copyright notice and this paragraph are 120Sstevel@tonic-gate * duplicated in all such forms and that any documentation, 130Sstevel@tonic-gate * advertising materials, and other materials related to such 140Sstevel@tonic-gate * distribution and use acknowledge that the software was developed 150Sstevel@tonic-gate * by the University of California, Berkeley. The name of the 160Sstevel@tonic-gate * University may not be used to endorse or promote products derived 170Sstevel@tonic-gate * from this software without specific prior written permission. 180Sstevel@tonic-gate * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR 190Sstevel@tonic-gate * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED 200Sstevel@tonic-gate * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
210Sstevel@tonic-gate */ 220Sstevel@tonic-gate 230Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 240Sstevel@tonic-gate 250Sstevel@tonic-gate #include "mpd_defs.h" 260Sstevel@tonic-gate #include "mpd_tables.h" 270Sstevel@tonic-gate 280Sstevel@tonic-gate /* 290Sstevel@tonic-gate * Probe types for probe() 300Sstevel@tonic-gate */ 310Sstevel@tonic-gate #define PROBE_UNI 0x1234 /* Unicast probe packet */ 320Sstevel@tonic-gate #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 330Sstevel@tonic-gate #define PROBE_RTT 0x9abc /* RTT only probe packet */ 340Sstevel@tonic-gate 350Sstevel@tonic-gate #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 360Sstevel@tonic-gate 370Sstevel@tonic-gate /* 380Sstevel@tonic-gate * Format of probe / probe response packets. This is an ICMP Echo request 390Sstevel@tonic-gate * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6 400Sstevel@tonic-gate */ 410Sstevel@tonic-gate struct pr_icmp 420Sstevel@tonic-gate { 430Sstevel@tonic-gate uint8_t pr_icmp_type; /* type field */ 440Sstevel@tonic-gate uint8_t pr_icmp_code; /* code field */ 450Sstevel@tonic-gate uint16_t pr_icmp_cksum; /* checksum field */ 460Sstevel@tonic-gate uint16_t pr_icmp_id; /* Identification */ 470Sstevel@tonic-gate uint16_t pr_icmp_seq; /* sequence number */ 480Sstevel@tonic-gate uint32_t pr_icmp_timestamp; /* Time stamp */ 490Sstevel@tonic-gate uint32_t pr_icmp_mtype; /* Message type */ 500Sstevel@tonic-gate }; 510Sstevel@tonic-gate 520Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 530Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 540Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 550Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x1 } }; 560Sstevel@tonic-gate 570Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 580Sstevel@tonic-gate 590Sstevel@tonic-gate static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 600Sstevel@tonic-gate 610Sstevel@tonic-gate 
static void *find_ancillary(struct msghdr *msg, int cmsg_type); 620Sstevel@tonic-gate static void pi_set_crtt(struct target *tg, int m, 630Sstevel@tonic-gate boolean_t is_probe_uni); 640Sstevel@tonic-gate static void incoming_echo_reply(struct phyint_instance *pii, 650Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 660Sstevel@tonic-gate static void incoming_rtt_reply(struct phyint_instance *pii, 670Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 680Sstevel@tonic-gate static void incoming_mcast_reply(struct phyint_instance *pii, 690Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 700Sstevel@tonic-gate 710Sstevel@tonic-gate static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 720Sstevel@tonic-gate static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 730Sstevel@tonic-gate static boolean_t check_exception_target(struct phyint_instance *pii, 740Sstevel@tonic-gate struct target *target); 750Sstevel@tonic-gate static void probe_fail_info(struct phyint_instance *pii, 760Sstevel@tonic-gate struct target *cur_tg, struct probe_fail_count *pfinfo); 770Sstevel@tonic-gate static void probe_success_info(struct phyint_instance *pii, 780Sstevel@tonic-gate struct target *cur_tg, struct probe_success_count *psinfo); 790Sstevel@tonic-gate static boolean_t phyint_repaired(struct phyint *pi); 800Sstevel@tonic-gate 810Sstevel@tonic-gate static int failover(struct phyint *from, struct phyint *to); 820Sstevel@tonic-gate static int failback(struct phyint *from, struct phyint *to); 830Sstevel@tonic-gate static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); 840Sstevel@tonic-gate 850Sstevel@tonic-gate static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 860Sstevel@tonic-gate static int in_cksum(ushort_t *addr, int len); 870Sstevel@tonic-gate static void reset_snxt_basetimes(void); 880Sstevel@tonic-gate 890Sstevel@tonic-gate /* 900Sstevel@tonic-gate * CRTT - 
Conservative Round Trip Time Estimate 910Sstevel@tonic-gate * Probe success - A matching probe reply received before CRTT ms has elapsed 920Sstevel@tonic-gate * after sending the probe. 930Sstevel@tonic-gate * Probe failure - No probe reply received and more than CRTT ms has elapsed 940Sstevel@tonic-gate * after sending the probe. 950Sstevel@tonic-gate * 960Sstevel@tonic-gate * TLS - Time last success. Most recent probe ack received at this time. 970Sstevel@tonic-gate * TFF - Time first fail. The time of the earliest probe failure in 980Sstevel@tonic-gate * a consecutive series of probe failures. 990Sstevel@tonic-gate * NUM_PROBE_REPAIRS - Number of consecutive successful probes required 1000Sstevel@tonic-gate * before declaring phyint repair. 1010Sstevel@tonic-gate * NUM_PROBE_FAILS - Number of consecutive probe failures required to 1020Sstevel@tonic-gate * declare a phyint failure. 1030Sstevel@tonic-gate * 1040Sstevel@tonic-gate * Phyint state diagram 1050Sstevel@tonic-gate * 1060Sstevel@tonic-gate * The state of a phyint that is capable of being probed, is completely 1070Sstevel@tonic-gate * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>. 1080Sstevel@tonic-gate * 1090Sstevel@tonic-gate * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state 1100Sstevel@tonic-gate * of the link (according to the driver). If the phyint is also configured 1110Sstevel@tonic-gate * with a test address (the common case) and probe targets, then a phyint must 1120Sstevel@tonic-gate * also successfully be able to send and receive probes in order to remain in 1130Sstevel@tonic-gate * the PI_RUNNING state (otherwise, it transitions to PI_FAILED). 
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state. When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state - PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_groupfailed - Group failure, all interfaces in the group have failed.
 *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED.
In the case of host targets, we delete the 1390Sstevel@tonic-gate * list of targets, and multicast to the all hosts, to reconstruct the 1400Sstevel@tonic-gate * target list. So the phyints are in the PI_NOTARGETS state. 1410Sstevel@tonic-gate * 1420Sstevel@tonic-gate * I - value of (pi_flags & IFF_INACTIVE) 143704Sethindra * IFF_INACTIVE: No failovers have been done to this phyint, from 144704Sethindra * other phyints. This phyint is inactive. Phyint can be a Standby. 145704Sethindra * When failback has been disabled (FAILOVER=no configured), 146704Sethindra * phyint can also be a non-STANDBY. In this case IFF_INACTIVE 147704Sethindra * is set when phyint subsequently recovers after a failure. 1480Sstevel@tonic-gate * 1490Sstevel@tonic-gate * pi_empty 1500Sstevel@tonic-gate * This phyint has failed over successfully to another phyint, and 1510Sstevel@tonic-gate * this phyint is currently "empty". It does not host any addresses or 1520Sstevel@tonic-gate * multicast membership etc. This is the state of a phyint after a 1530Sstevel@tonic-gate * failover from the phyint has completed successfully and no subsequent 1540Sstevel@tonic-gate * 'failover to' or 'failback to' has occurred on the phyint. 1550Sstevel@tonic-gate * IP guarantees that no new logicals will be hosted nor any multicast 1560Sstevel@tonic-gate * joins permitted on the phyint, since the phyint is either failed or 1570Sstevel@tonic-gate * inactive. pi_empty is set implies the phyint is either failed or 1580Sstevel@tonic-gate * inactive. 1590Sstevel@tonic-gate * 1600Sstevel@tonic-gate * pi_full 1610Sstevel@tonic-gate * The phyint hosts all of its own addresses that it "owns". If the 1620Sstevel@tonic-gate * phyint was previously failed or inactive, failbacks to the phyint 1630Sstevel@tonic-gate * has completed successfully. i.e. No more failbacks to this phyint 1640Sstevel@tonic-gate * can produce any change in system state whatsoever. 
1650Sstevel@tonic-gate * 1660Sstevel@tonic-gate * Not all 32 possible combinations of the above 5-tuple are possible. 1670Sstevel@tonic-gate * Furthermore some of the above combinations are transient. They may occur 1680Sstevel@tonic-gate * only because the failover or failback did not complete successfully. The 1690Sstevel@tonic-gate * failover/failback will be retried and eventually a stable state will be 1700Sstevel@tonic-gate * reached. 1710Sstevel@tonic-gate * 1720Sstevel@tonic-gate * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd. 1730Sstevel@tonic-gate * The following are the state machines. 'from' and 'to' are the src and 1740Sstevel@tonic-gate * dst of the failover/failback, below 1750Sstevel@tonic-gate * 1760Sstevel@tonic-gate * pi_empty state machine 1770Sstevel@tonic-gate * --------------------------------------------------------------------------- 1780Sstevel@tonic-gate * Event State -> New State 1790Sstevel@tonic-gate * --------------------------------------------------------------------------- 1800Sstevel@tonic-gate * successful completion from.pi_empty = 0 -> from.pi_empty = 1 1810Sstevel@tonic-gate * of failover 1820Sstevel@tonic-gate * 1830Sstevel@tonic-gate * Initiate failover to.pi_empty = X -> to.pi_empty = 0 1840Sstevel@tonic-gate * 1850Sstevel@tonic-gate * Initiate failback to.pi_empty = X -> to.pi_empty = 0 1860Sstevel@tonic-gate * 1870Sstevel@tonic-gate * group failure pi_empty = X -> pi_empty = 0 1880Sstevel@tonic-gate * --------------------------------------------------------------------------- 1890Sstevel@tonic-gate * 1900Sstevel@tonic-gate * pi_full state machine 1910Sstevel@tonic-gate * --------------------------------------------------------------------------- 1920Sstevel@tonic-gate * Event State -> New State 1930Sstevel@tonic-gate * --------------------------------------------------------------------------- 1940Sstevel@tonic-gate * successful completion to.pi_full = 0 -> to.pi_full = 1 1950Sstevel@tonic-gate 
* of failback from 1960Sstevel@tonic-gate * each of the other phyints 1970Sstevel@tonic-gate * 1980Sstevel@tonic-gate * Initiate failover from.pi_full = X -> from.pi_full = 0 1990Sstevel@tonic-gate * 2000Sstevel@tonic-gate * group failure pi_full = X -> pi_full = 0 2010Sstevel@tonic-gate * --------------------------------------------------------------------------- 2020Sstevel@tonic-gate * 2030Sstevel@tonic-gate * pi_state state machine 2040Sstevel@tonic-gate * --------------------------------------------------------------------------- 2050Sstevel@tonic-gate * Event State New State 2060Sstevel@tonic-gate * Action: 2070Sstevel@tonic-gate * --------------------------------------------------------------------------- 2080Sstevel@tonic-gate * NIC failure (PI_RUNNING, I == 0) -> (PI_FAILED, I == 0) 2090Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint 2100Sstevel@tonic-gate * : failover from this phyint to another 2110Sstevel@tonic-gate * 212704Sethindra * NIC failure (PI_RUNNING, I == 1) -> (PI_FAILED, I == 0) 2130Sstevel@tonic-gate * detection : set IFF_FAILED on this phyint 2140Sstevel@tonic-gate * 215704Sethindra * NIC repair (PI_FAILED, I == 0, FAILBACK=yes) 216704Sethindra * detection -> (PI_RUNNING, I == 0) 217704Sethindra * : to.pi_empty = 0 2180Sstevel@tonic-gate * : clear IFF_FAILED on this phyint 219704Sethindra * : failback to this phyint if enabled 2200Sstevel@tonic-gate * 221704Sethindra * NIC repair (PI_FAILED, I == 0, FAILBACK=no) 222704Sethindra * detection -> (PI_RUNNING, I == 1) 223704Sethindra * : to.pi_empty = 0 224704Sethindra * : clear IFF_FAILED on this phyint 225704Sethindra * : if failback is disabled set I == 1 2260Sstevel@tonic-gate * 2270Sstevel@tonic-gate * Group failure (perform on all phyints in the group) 2280Sstevel@tonic-gate * detection PI_RUNNING PI_FAILED 2290Sstevel@tonic-gate * (Router targets) : set IFF_FAILED 2300Sstevel@tonic-gate * : clear pi_empty and pi_full 2310Sstevel@tonic-gate * 2320Sstevel@tonic-gate * Group 
failure (perform on all phyints in the group) 2330Sstevel@tonic-gate * detection PI_RUNNING PI_NOTARGETS 2340Sstevel@tonic-gate * (Host targets) : set IFF_FAILED 2350Sstevel@tonic-gate * : clear pi_empty and pi_full 2360Sstevel@tonic-gate * : delete the target list on all phyints 2370Sstevel@tonic-gate * --------------------------------------------------------------------------- 2380Sstevel@tonic-gate * 2390Sstevel@tonic-gate * I state machine 2400Sstevel@tonic-gate * --------------------------------------------------------------------------- 2410Sstevel@tonic-gate * Event State Action: 2420Sstevel@tonic-gate * --------------------------------------------------------------------------- 243704Sethindra * Turn on I pi_empty == 0, STANDBY : failover from standby 2440Sstevel@tonic-gate * 245704Sethindra * Turn off I PI_RUNNING, STANDBY : pi_empty = 0 2460Sstevel@tonic-gate * pi_full == 0 : failback to this if enabled 2470Sstevel@tonic-gate * --------------------------------------------------------------------------- 2480Sstevel@tonic-gate * 2490Sstevel@tonic-gate * Assertions: (Read '==>' as implies) 2500Sstevel@tonic-gate * 2510Sstevel@tonic-gate * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED) 2520Sstevel@tonic-gate * (pi_empty == 1) ==> (pi_full == 0) 2530Sstevel@tonic-gate * (pi_full == 1) ==> (pi_empty == 0) 2540Sstevel@tonic-gate * 2550Sstevel@tonic-gate * Invariants 2560Sstevel@tonic-gate * 2570Sstevel@tonic-gate * pg_groupfailed = 0 && 258704Sethindra * 1. (I == 1, pi_empty == 0) ==> initiate failover from standby 2590Sstevel@tonic-gate * 2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint 2600Sstevel@tonic-gate * 3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint 2610Sstevel@tonic-gate * 2620Sstevel@tonic-gate * 1. says that an inactive standby, that is not empty, has to be failed 2630Sstevel@tonic-gate * over. For a standby to be truly inactive, it should not host any 2640Sstevel@tonic-gate * addresses. 
So we move them to some other phyint. Usually we catch the 2650Sstevel@tonic-gate * turn on of IFF_INACTIVE, and perform this action. However if the failover 2660Sstevel@tonic-gate * did not complete successfully, then subsequently we have lost the edge 2670Sstevel@tonic-gate * trigger, and this invariant kicks in and completes the action. 2680Sstevel@tonic-gate * 2690Sstevel@tonic-gate * 2. says that any failed phyint that is not empty must be failed over. 2700Sstevel@tonic-gate * Usually we do the failover when we detect NIC failure. However if the 2710Sstevel@tonic-gate * failover does not complete successfully, this invariant kicks in and 2720Sstevel@tonic-gate * completes the failover. We exclude inactive standby which is covered by 1. 2730Sstevel@tonic-gate * 2740Sstevel@tonic-gate * 3. says that any running phyint that is not full must be failed back. 2750Sstevel@tonic-gate * Usually we do the failback when we detect NIC repair. However if the 2760Sstevel@tonic-gate * failback does not complete successfully, this invariant kicks in and 2770Sstevel@tonic-gate * completes the failback. Note that we don't want to failback to an inactive 2780Sstevel@tonic-gate * standby. 2790Sstevel@tonic-gate * 2800Sstevel@tonic-gate * The invariants 1 - 3 and the actions are in initifs(). 2810Sstevel@tonic-gate */ 2820Sstevel@tonic-gate 2830Sstevel@tonic-gate struct probes_missed probes_missed; 2840Sstevel@tonic-gate 2850Sstevel@tonic-gate /* 2860Sstevel@tonic-gate * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 2870Sstevel@tonic-gate * will be added on by the kernel. The id field identifies this phyint. 2880Sstevel@tonic-gate * and the sequence number is an increasing (modulo 2^^16) integer. The data 2890Sstevel@tonic-gate * portion holds the time value when the packet is sent. On echo this is 2900Sstevel@tonic-gate * extracted to compute the round-trip time. Three different types of 2910Sstevel@tonic-gate * probe packets are used. 
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 *	are not used. Under heavy network load, the rtt may go up very high,
 *	due to a spike, or may appear to go high, due to extreme scheduling
 *	delays. Once the network stress is removed, mpathd takes a long time
 *	to recover, because the probe_interval is already high, and it takes
 *	a long time to send out sufficient number of probes to bring down the
 *	rtt. To avoid this problem, PROBE_RTT probes are sent out every
 *	user_probe_interval ms. and will cause only rtt updates. These packets
 *	do not consume sequence number space nor is information about these
 *	packets stored in the pii_probes[].
 *
 * PROBE_MULTI: This type is only used to construct a list of targets, when
 *	no targets are known. The packet is multicast to the all hosts addr.
3110Sstevel@tonic-gate */ 3120Sstevel@tonic-gate static void 3130Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 3140Sstevel@tonic-gate { 3150Sstevel@tonic-gate struct pr_icmp probe_pkt; /* Probe packet */ 3160Sstevel@tonic-gate struct sockaddr_in6 whereto6; /* target address IPv6 */ 3170Sstevel@tonic-gate struct sockaddr_in whereto; /* target address IPv4 */ 3180Sstevel@tonic-gate int pr_ndx; /* probe index in pii->pii_probes[] */ 3190Sstevel@tonic-gate boolean_t sent = _B_TRUE; 3200Sstevel@tonic-gate 3210Sstevel@tonic-gate if (debug & D_TARGET) { 3220Sstevel@tonic-gate logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 3230Sstevel@tonic-gate pii->pii_name, probe_type, cur_time); 3240Sstevel@tonic-gate } 3250Sstevel@tonic-gate 3260Sstevel@tonic-gate assert(pii->pii_probe_sock != -1); 3270Sstevel@tonic-gate assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 3280Sstevel@tonic-gate probe_type == PROBE_RTT); 3290Sstevel@tonic-gate 3300Sstevel@tonic-gate probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 3310Sstevel@tonic-gate ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 3320Sstevel@tonic-gate probe_pkt.pr_icmp_code = 0; 3330Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 0; 3340Sstevel@tonic-gate probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 3350Sstevel@tonic-gate 3360Sstevel@tonic-gate /* 3370Sstevel@tonic-gate * Since there is no need to do arithmetic on the icmpid, 3380Sstevel@tonic-gate * (only equality check is done) pii_icmpid is stored in 3390Sstevel@tonic-gate * network byte order at initialization itself. 3400Sstevel@tonic-gate */ 3410Sstevel@tonic-gate probe_pkt.pr_icmp_id = pii->pii_icmpid; 3420Sstevel@tonic-gate probe_pkt.pr_icmp_timestamp = htonl(cur_time); 3430Sstevel@tonic-gate probe_pkt.pr_icmp_mtype = htonl(probe_type); 3440Sstevel@tonic-gate 3450Sstevel@tonic-gate /* 3460Sstevel@tonic-gate * If probe_type is PROBE_MULTI, this packet will be multicast to 3470Sstevel@tonic-gate * the all hosts address. 
Otherwise it is unicast to the next target. 3480Sstevel@tonic-gate */ 3490Sstevel@tonic-gate assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 3500Sstevel@tonic-gate pii->pii_rtt_target_next != NULL)); 3510Sstevel@tonic-gate 3520Sstevel@tonic-gate if (pii->pii_af == AF_INET6) { 3530Sstevel@tonic-gate bzero(&whereto6, sizeof (whereto6)); 3540Sstevel@tonic-gate whereto6.sin6_family = AF_INET6; 3550Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 3560Sstevel@tonic-gate whereto6.sin6_addr = all_nodes_mcast_v6; 3570Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) { 3580Sstevel@tonic-gate whereto6.sin6_addr = pii->pii_target_next->tg_address; 3590Sstevel@tonic-gate } else { 3600Sstevel@tonic-gate /* type is PROBE_RTT */ 3610Sstevel@tonic-gate whereto6.sin6_addr = 3620Sstevel@tonic-gate pii->pii_rtt_target_next->tg_address; 3630Sstevel@tonic-gate } 3640Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 3650Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 3660Sstevel@tonic-gate sizeof (whereto6)) != sizeof (probe_pkt)) { 3670Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 3680Sstevel@tonic-gate sent = _B_FALSE; 3690Sstevel@tonic-gate } 3700Sstevel@tonic-gate } else { 3710Sstevel@tonic-gate bzero(&whereto, sizeof (whereto)); 3720Sstevel@tonic-gate whereto.sin_family = AF_INET; 3730Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 3740Sstevel@tonic-gate whereto.sin_addr = all_nodes_mcast_v4; 3750Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) { 3760Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 3770Sstevel@tonic-gate &pii->pii_target_next->tg_address, 3780Sstevel@tonic-gate &whereto.sin_addr); 3790Sstevel@tonic-gate } else { 3800Sstevel@tonic-gate /* type is PROBE_RTT */ 3810Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 3820Sstevel@tonic-gate &pii->pii_rtt_target_next->tg_address, 3830Sstevel@tonic-gate &whereto.sin_addr); 3840Sstevel@tonic-gate } 3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate /* 3870Sstevel@tonic-gate * Compute the IPv4 icmp checksum. Does not cover the IP header. 3880Sstevel@tonic-gate */ 3890Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 3900Sstevel@tonic-gate in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 3910Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 3920Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 3930Sstevel@tonic-gate sizeof (whereto)) != sizeof (probe_pkt)) { 3940Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 3950Sstevel@tonic-gate sent = _B_FALSE; 3960Sstevel@tonic-gate } 3970Sstevel@tonic-gate } 3980Sstevel@tonic-gate 3990Sstevel@tonic-gate /* 4000Sstevel@tonic-gate * If this is a PROBE_UNI probe packet being unicast to a target, then 4010Sstevel@tonic-gate * update our tables. We will need this info in processing the probe 4020Sstevel@tonic-gate * response. PROBE_MULTI and PROBE_RTT packets are not used for 4030Sstevel@tonic-gate * the purpose of failure or recovery detection. PROBE_MULTI packets 4040Sstevel@tonic-gate * are only used to construct a list of targets. PROBE_RTT packets are 4050Sstevel@tonic-gate * used only for updating the rtt and not for failure detection. 4060Sstevel@tonic-gate */ 4070Sstevel@tonic-gate if (probe_type == PROBE_UNI && sent) { 4080Sstevel@tonic-gate pr_ndx = pii->pii_probe_next; 4090Sstevel@tonic-gate assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 4100Sstevel@tonic-gate 4110Sstevel@tonic-gate /* Collect statistics, before we reuse the last slot. 
*/ 4120Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 4130Sstevel@tonic-gate pii->pii_cum_stats.lost++; 4140Sstevel@tonic-gate else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 4150Sstevel@tonic-gate pii->pii_cum_stats.acked++; 4160Sstevel@tonic-gate pii->pii_cum_stats.sent++; 4170Sstevel@tonic-gate 4180Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 4190Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 4200Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 4210Sstevel@tonic-gate pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 4220Sstevel@tonic-gate pii->pii_target_next = target_next(pii->pii_target_next); 4230Sstevel@tonic-gate assert(pii->pii_target_next != NULL); 4240Sstevel@tonic-gate /* 4250Sstevel@tonic-gate * If we have a single variable to denote the next target to 4260Sstevel@tonic-gate * probe for both rtt probes and failure detection probes, we 4270Sstevel@tonic-gate * could end up with a situation where the failure detection 4280Sstevel@tonic-gate * probe targets become disjoint from the rtt probe targets. 4290Sstevel@tonic-gate * Eg. if 2 targets and the actual fdt is double the user 4300Sstevel@tonic-gate * specified fdt. So we have 2 variables. In this scheme 4310Sstevel@tonic-gate * we also reset pii_rtt_target_next for every fdt probe, 4320Sstevel@tonic-gate * though that may not be necessary. 4330Sstevel@tonic-gate */ 4340Sstevel@tonic-gate pii->pii_rtt_target_next = pii->pii_target_next; 4350Sstevel@tonic-gate pii->pii_snxt++; 4360Sstevel@tonic-gate } else if (probe_type == PROBE_RTT) { 4370Sstevel@tonic-gate pii->pii_rtt_target_next = 4380Sstevel@tonic-gate target_next(pii->pii_rtt_target_next); 4390Sstevel@tonic-gate assert(pii->pii_rtt_target_next != NULL); 4400Sstevel@tonic-gate } 4410Sstevel@tonic-gate } 4420Sstevel@tonic-gate 4430Sstevel@tonic-gate /* 4440Sstevel@tonic-gate * Incoming IPv4 data from wire, is received here. 
Called from main. 4450Sstevel@tonic-gate */ 4460Sstevel@tonic-gate void 4470Sstevel@tonic-gate in_data(struct phyint_instance *pii) 4480Sstevel@tonic-gate { 4490Sstevel@tonic-gate struct sockaddr_in from; 4500Sstevel@tonic-gate struct in6_addr fromaddr; 4510Sstevel@tonic-gate uint_t fromlen; 4520Sstevel@tonic-gate static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 4530Sstevel@tonic-gate struct ip *ip; 4540Sstevel@tonic-gate int iphlen; 4550Sstevel@tonic-gate int len; 4560Sstevel@tonic-gate char abuf[INET_ADDRSTRLEN]; 4570Sstevel@tonic-gate struct pr_icmp *reply; 4580Sstevel@tonic-gate 4590Sstevel@tonic-gate if (debug & D_PROBE) { 4600Sstevel@tonic-gate logdebug("in_data(%s %s)\n", 4610Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 4620Sstevel@tonic-gate } 4630Sstevel@tonic-gate 4640Sstevel@tonic-gate /* 4650Sstevel@tonic-gate * Poll has already told us that a message is waiting, 4660Sstevel@tonic-gate * on this socket. Read it now. We should not block. 4670Sstevel@tonic-gate */ 4680Sstevel@tonic-gate fromlen = sizeof (from); 4690Sstevel@tonic-gate len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 4700Sstevel@tonic-gate sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 4710Sstevel@tonic-gate if (len < 0) { 4720Sstevel@tonic-gate logperror_pii(pii, "in_data: recvfrom"); 4730Sstevel@tonic-gate return; 4740Sstevel@tonic-gate } 4750Sstevel@tonic-gate 4760Sstevel@tonic-gate /* 4770Sstevel@tonic-gate * If the NIC has indicated the link is down, don't go 4780Sstevel@tonic-gate * any further. 
4790Sstevel@tonic-gate */ 4800Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 4810Sstevel@tonic-gate return; 4820Sstevel@tonic-gate 4830Sstevel@tonic-gate /* Get the printable address for error reporting */ 4840Sstevel@tonic-gate (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 4850Sstevel@tonic-gate 4860Sstevel@tonic-gate /* Make sure packet contains at least minimum ICMP header */ 4870Sstevel@tonic-gate ip = (struct ip *)in_packet; 4880Sstevel@tonic-gate iphlen = ip->ip_hl << 2; 4890Sstevel@tonic-gate if (len < iphlen + ICMP_MINLEN) { 4900Sstevel@tonic-gate if (debug & D_PKTBAD) { 4910Sstevel@tonic-gate logdebug("in_data: packet too short (%d bytes)" 4920Sstevel@tonic-gate " from %s\n", len, abuf); 4930Sstevel@tonic-gate } 4940Sstevel@tonic-gate return; 4950Sstevel@tonic-gate } 4960Sstevel@tonic-gate 4970Sstevel@tonic-gate /* 4980Sstevel@tonic-gate * Subtract the IP hdr length, 'len' will be length of the probe 4990Sstevel@tonic-gate * reply, starting from the icmp hdr. 5000Sstevel@tonic-gate */ 5010Sstevel@tonic-gate len -= iphlen; 5020Sstevel@tonic-gate /* LINTED */ 5030Sstevel@tonic-gate reply = (struct pr_icmp *)((char *)in_packet + iphlen); 5040Sstevel@tonic-gate 5050Sstevel@tonic-gate /* Probe replies are icmp echo replies. Ignore anything else */ 5060Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 5070Sstevel@tonic-gate return; 5080Sstevel@tonic-gate 5090Sstevel@tonic-gate /* 5100Sstevel@tonic-gate * The icmp id should match what we sent, which is stored 5110Sstevel@tonic-gate * in pi_icmpid. The icmp code for reply must be 0. 
5120Sstevel@tonic-gate * The reply content must be a struct pr_icmp 5130Sstevel@tonic-gate */ 5140Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 5150Sstevel@tonic-gate /* Not in response to our probe */ 5160Sstevel@tonic-gate return; 5170Sstevel@tonic-gate } 5180Sstevel@tonic-gate 5190Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 5200Sstevel@tonic-gate logtrace("probe reply code %d from %s on %s\n", 5210Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 5220Sstevel@tonic-gate return; 5230Sstevel@tonic-gate } 5240Sstevel@tonic-gate 5250Sstevel@tonic-gate if (len < sizeof (struct pr_icmp)) { 5260Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 5270Sstevel@tonic-gate len, abuf, pii->pii_name); 5280Sstevel@tonic-gate return; 5290Sstevel@tonic-gate } 5300Sstevel@tonic-gate 5310Sstevel@tonic-gate IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 5320Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 5330Sstevel@tonic-gate /* Unicast probe reply */ 5340Sstevel@tonic-gate incoming_echo_reply(pii, reply, fromaddr); 5350Sstevel@tonic-gate else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 5360Sstevel@tonic-gate /* Multicast reply */ 5370Sstevel@tonic-gate incoming_mcast_reply(pii, reply, fromaddr); 5380Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 5390Sstevel@tonic-gate incoming_rtt_reply(pii, reply, fromaddr); 5400Sstevel@tonic-gate } else { 5410Sstevel@tonic-gate /* Probably not in response to our probe */ 5420Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 5430Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name); 5440Sstevel@tonic-gate return; 5450Sstevel@tonic-gate } 5460Sstevel@tonic-gate 5470Sstevel@tonic-gate } 5480Sstevel@tonic-gate 5490Sstevel@tonic-gate /* 5500Sstevel@tonic-gate * Incoming IPv6 data from wire is received here. Called from main. 
5510Sstevel@tonic-gate */ 5520Sstevel@tonic-gate void 5530Sstevel@tonic-gate in6_data(struct phyint_instance *pii) 5540Sstevel@tonic-gate { 5550Sstevel@tonic-gate struct sockaddr_in6 from; 5560Sstevel@tonic-gate static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 5570Sstevel@tonic-gate static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 5580Sstevel@tonic-gate int len; 5590Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 5600Sstevel@tonic-gate struct msghdr msg; 5610Sstevel@tonic-gate struct iovec iov; 5620Sstevel@tonic-gate uchar_t *opt; 5630Sstevel@tonic-gate struct pr_icmp *reply; 5640Sstevel@tonic-gate 5650Sstevel@tonic-gate if (debug & D_PROBE) { 5660Sstevel@tonic-gate logdebug("in6_data(%s %s)\n", 5670Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 5680Sstevel@tonic-gate } 5690Sstevel@tonic-gate 5700Sstevel@tonic-gate iov.iov_base = (char *)in_packet; 5710Sstevel@tonic-gate iov.iov_len = sizeof (in_packet); 5720Sstevel@tonic-gate msg.msg_iov = &iov; 5730Sstevel@tonic-gate msg.msg_iovlen = 1; 5740Sstevel@tonic-gate msg.msg_name = (struct sockaddr *)&from; 5750Sstevel@tonic-gate msg.msg_namelen = sizeof (from); 5760Sstevel@tonic-gate msg.msg_control = ancillary_data; 5770Sstevel@tonic-gate msg.msg_controllen = sizeof (ancillary_data); 5780Sstevel@tonic-gate 5790Sstevel@tonic-gate if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 5800Sstevel@tonic-gate logperror_pii(pii, "in6_data: recvfrom"); 5810Sstevel@tonic-gate return; 5820Sstevel@tonic-gate } 5830Sstevel@tonic-gate 5840Sstevel@tonic-gate /* 5850Sstevel@tonic-gate * If the NIC has indicated that the link is down, don't go 5860Sstevel@tonic-gate * any further. 
5870Sstevel@tonic-gate */ 5880Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 5890Sstevel@tonic-gate return; 5900Sstevel@tonic-gate 5910Sstevel@tonic-gate /* Get the printable address for error reporting */ 5920Sstevel@tonic-gate (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 5930Sstevel@tonic-gate if (len < ICMP_MINLEN) { 5940Sstevel@tonic-gate if (debug & D_PKTBAD) { 5950Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 5960Sstevel@tonic-gate msg.msg_flags, abuf); 5970Sstevel@tonic-gate } 5980Sstevel@tonic-gate return; 5990Sstevel@tonic-gate } 6000Sstevel@tonic-gate /* Ignore packets > 64k or control buffers that don't fit */ 6010Sstevel@tonic-gate if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 6020Sstevel@tonic-gate if (debug & D_PKTBAD) { 6030Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 6040Sstevel@tonic-gate msg.msg_flags, abuf); 6050Sstevel@tonic-gate } 6060Sstevel@tonic-gate return; 6070Sstevel@tonic-gate } 6080Sstevel@tonic-gate 6090Sstevel@tonic-gate reply = (struct pr_icmp *)in_packet; 6100Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 6110Sstevel@tonic-gate return; 6120Sstevel@tonic-gate 6130Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 6140Sstevel@tonic-gate /* Not in response to our probe */ 6150Sstevel@tonic-gate return; 6160Sstevel@tonic-gate } 6170Sstevel@tonic-gate 6180Sstevel@tonic-gate /* 6190Sstevel@tonic-gate * The kernel has already verified the the ICMP checksum. 
6200Sstevel@tonic-gate */ 6210Sstevel@tonic-gate if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 6220Sstevel@tonic-gate logtrace("ICMPv6 echo reply source address not linklocal from " 6230Sstevel@tonic-gate "%s on %s\n", abuf, pii->pii_name); 6240Sstevel@tonic-gate return; 6250Sstevel@tonic-gate } 6260Sstevel@tonic-gate opt = find_ancillary(&msg, IPV6_RTHDR); 6270Sstevel@tonic-gate if (opt != NULL) { 6280Sstevel@tonic-gate /* Can't allow routing headers in probe replies */ 6290Sstevel@tonic-gate logtrace("message with routing header from %s on %s\n", 6300Sstevel@tonic-gate abuf, pii->pii_name); 6310Sstevel@tonic-gate return; 6320Sstevel@tonic-gate } 6330Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 6340Sstevel@tonic-gate logtrace("probe reply code: %d from %s on %s\n", 6350Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 6360Sstevel@tonic-gate return; 6370Sstevel@tonic-gate } 6380Sstevel@tonic-gate if (len < (sizeof (struct pr_icmp))) { 6390Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 6400Sstevel@tonic-gate len, abuf, pii->pii_name); 6410Sstevel@tonic-gate return; 6420Sstevel@tonic-gate } 6430Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 6440Sstevel@tonic-gate incoming_echo_reply(pii, reply, from.sin6_addr); 6450Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 6460Sstevel@tonic-gate incoming_mcast_reply(pii, reply, from.sin6_addr); 6470Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 6480Sstevel@tonic-gate incoming_rtt_reply(pii, reply, from.sin6_addr); 6490Sstevel@tonic-gate } else { 6500Sstevel@tonic-gate /* Probably not in response to our probe */ 6510Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 6520Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name); 6530Sstevel@tonic-gate } 6540Sstevel@tonic-gate } 6550Sstevel@tonic-gate 6560Sstevel@tonic-gate /* 6570Sstevel@tonic-gate * Process the incoming rtt reply, in 
response to our rtt probe. 6580Sstevel@tonic-gate * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 6590Sstevel@tonic-gate * have any stored information about the probe we sent. So we don't log 6600Sstevel@tonic-gate * any errors if we receive bad replies. 6610Sstevel@tonic-gate */ 6620Sstevel@tonic-gate static void 6630Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 6640Sstevel@tonic-gate struct in6_addr fromaddr) 6650Sstevel@tonic-gate { 6660Sstevel@tonic-gate int m; /* rtt measurment in ms */ 6670Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 6680Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 6690Sstevel@tonic-gate struct target *target; 6700Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 6710Sstevel@tonic-gate struct phyint_group *pg; 6720Sstevel@tonic-gate 6730Sstevel@tonic-gate /* Get the printable address for error reporting */ 6740Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 6750Sstevel@tonic-gate 6760Sstevel@tonic-gate if (debug & D_PROBE) { 6770Sstevel@tonic-gate logdebug("incoming_rtt_reply: %s %s %s\n", 6780Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf); 6790Sstevel@tonic-gate } 6800Sstevel@tonic-gate 6810Sstevel@tonic-gate /* Do we know this target ? */ 6820Sstevel@tonic-gate target = target_lookup(pii, fromaddr); 6830Sstevel@tonic-gate if (target == NULL) 6840Sstevel@tonic-gate return; 6850Sstevel@tonic-gate 6860Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 6870Sstevel@tonic-gate cur_time = getcurrenttime(); 6880Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 6890Sstevel@tonic-gate 6900Sstevel@tonic-gate /* Invalid rtt. 
It has wrapped around */ 6910Sstevel@tonic-gate if (m < 0) 6920Sstevel@tonic-gate return; 6930Sstevel@tonic-gate 6940Sstevel@tonic-gate /* 6950Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 6960Sstevel@tonic-gate * The initial few responses after the interface is repaired may 6970Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 6980Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 6990Sstevel@tonic-gate */ 7000Sstevel@tonic-gate pg = pii->pii_phyint->pi_group; 7010Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 7020Sstevel@tonic-gate return; 7030Sstevel@tonic-gate 7040Sstevel@tonic-gate /* 7050Sstevel@tonic-gate * Update rtt only if the new rtt is lower than the current rtt. 7060Sstevel@tonic-gate * (specified by the 3rd parameter to pi_set_crtt). 7070Sstevel@tonic-gate * If a spike has caused the current probe_interval to be > 7080Sstevel@tonic-gate * user_probe_interval, then this mechanism is used to bring down 7090Sstevel@tonic-gate * the rtt rapidly once the network stress is removed. 7100Sstevel@tonic-gate * If the new rtt is higher than the current rtt, we don't want to 7110Sstevel@tonic-gate * update the rtt. We are having more than 1 outstanding probe and 7120Sstevel@tonic-gate * the increase in rtt we are seeing is being unnecessarily weighted 7130Sstevel@tonic-gate * many times. The regular rtt update will be handled by 7140Sstevel@tonic-gate * incoming_echo_reply() and will take care of any rtt increase. 
7150Sstevel@tonic-gate */ 7160Sstevel@tonic-gate pi_set_crtt(target, m, _B_FALSE); 7170Sstevel@tonic-gate if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 7180Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 7190Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 7200Sstevel@tonic-gate /* 7210Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 7220Sstevel@tonic-gate * investigate if we can improve the failure detection time to 7230Sstevel@tonic-gate * meet whatever the user specified. 7240Sstevel@tonic-gate */ 7250Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 7260Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 7270Sstevel@tonic-gate user_failure_detection_time); 7280Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 7290Sstevel@tonic-gate if (pii->pii_phyint->pi_group != phyint_anongroup) { 7300Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 7310Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", 7320Sstevel@tonic-gate pg->pg_fdt, AF_STR(pii->pii_af), 7330Sstevel@tonic-gate pii->pii_name, 7340Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_name); 7350Sstevel@tonic-gate } 7360Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 7370Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 7380Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 7390Sstevel@tonic-gate /* 7400Sstevel@tonic-gate * No more rtt probes will be sent. The actual 7410Sstevel@tonic-gate * fdt has dropped to the user specified value. 7420Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 7430Sstevel@tonic-gate * will be in sync henceforth. 
7440Sstevel@tonic-gate */ 7450Sstevel@tonic-gate reset_snxt_basetimes(); 7460Sstevel@tonic-gate } 7470Sstevel@tonic-gate } 7480Sstevel@tonic-gate } 7490Sstevel@tonic-gate } 7500Sstevel@tonic-gate 7510Sstevel@tonic-gate /* 7520Sstevel@tonic-gate * Process the incoming echo reply, in response to our unicast probe. 7530Sstevel@tonic-gate * Common for both IPv4 and IPv6 7540Sstevel@tonic-gate */ 7550Sstevel@tonic-gate static void 7560Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 7570Sstevel@tonic-gate struct in6_addr fromaddr) 7580Sstevel@tonic-gate { 7590Sstevel@tonic-gate int m; /* rtt measurment in ms */ 7600Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 7610Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 7620Sstevel@tonic-gate int pr_ndx; 7630Sstevel@tonic-gate struct target *target; 7640Sstevel@tonic-gate boolean_t exception; 7650Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 7660Sstevel@tonic-gate uint16_t pr_icmp_seq; 7670Sstevel@tonic-gate struct phyint_group *pg = pii->pii_phyint->pi_group; 7680Sstevel@tonic-gate 7690Sstevel@tonic-gate /* Get the printable address for error reporting */ 7700Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 7710Sstevel@tonic-gate 7720Sstevel@tonic-gate if (debug & D_PROBE) { 7730Sstevel@tonic-gate logdebug("incoming_echo_reply: %s %s %s seq %u\n", 7740Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf, 7750Sstevel@tonic-gate ntohs(reply->pr_icmp_seq)); 7760Sstevel@tonic-gate } 7770Sstevel@tonic-gate 7780Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 7790Sstevel@tonic-gate pr_icmp_seq = ntohs(reply->pr_icmp_seq); 7800Sstevel@tonic-gate 7810Sstevel@tonic-gate /* Reject out of window probe replies */ 7820Sstevel@tonic-gate if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 7830Sstevel@tonic-gate SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 7840Sstevel@tonic-gate logtrace("out of window probe 
seq %u snxt %u on %s from %s\n", 7850Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 7860Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 7870Sstevel@tonic-gate return; 7880Sstevel@tonic-gate } 7890Sstevel@tonic-gate cur_time = getcurrenttime(); 7900Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 7910Sstevel@tonic-gate if (m < 0) { 7920Sstevel@tonic-gate /* 7930Sstevel@tonic-gate * This is a ridiculously high value of rtt. rtt has wrapped 7940Sstevel@tonic-gate * around. Log a message, and ignore the rtt. 7950Sstevel@tonic-gate */ 7960Sstevel@tonic-gate logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 7970Sstevel@tonic-gate "timestamp %u\n", cur_time, pr_icmp_timestamp); 7980Sstevel@tonic-gate } 7990Sstevel@tonic-gate 8000Sstevel@tonic-gate /* 8010Sstevel@tonic-gate * Get the probe index pr_ndx corresponding to the received icmp seq. 8020Sstevel@tonic-gate * number in our pii->pii_probes[] array. The icmp sequence number 8030Sstevel@tonic-gate * pii_snxt corresponds to the probe index pii->pii_probe_next 8040Sstevel@tonic-gate */ 8050Sstevel@tonic-gate pr_ndx = MOD_SUB(pii->pii_probe_next, 8060Sstevel@tonic-gate (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 8070Sstevel@tonic-gate 8080Sstevel@tonic-gate assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 8090Sstevel@tonic-gate 8100Sstevel@tonic-gate target = pii->pii_probes[pr_ndx].pr_target; 8110Sstevel@tonic-gate 8120Sstevel@tonic-gate /* 8130Sstevel@tonic-gate * Perform sanity checks, whether this probe reply that we 8140Sstevel@tonic-gate * have received is genuine 8150Sstevel@tonic-gate */ 8160Sstevel@tonic-gate if (target != NULL) { 8170Sstevel@tonic-gate /* 8180Sstevel@tonic-gate * Compare the src. addr of the received ICMP or ICMPv6 8190Sstevel@tonic-gate * probe reply with the target address in our tables. 
8200Sstevel@tonic-gate */ 8210Sstevel@tonic-gate if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 8220Sstevel@tonic-gate /* 8230Sstevel@tonic-gate * We don't have any record of having sent a probe to 8240Sstevel@tonic-gate * this target. This is a fake probe reply. Log an error 8250Sstevel@tonic-gate */ 8260Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 8270Sstevel@tonic-gate "snxt %u on %s from %s\n", 8280Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8290Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8300Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8310Sstevel@tonic-gate return; 8320Sstevel@tonic-gate } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 8330Sstevel@tonic-gate /* 8340Sstevel@tonic-gate * The address matches, but our tables indicate that 8350Sstevel@tonic-gate * this probe reply has been acked already. So this 8360Sstevel@tonic-gate * is a duplicate probe reply. Log an error 8370Sstevel@tonic-gate */ 8380Sstevel@tonic-gate logtrace("probe status %d Duplicate probe reply seq %u " 8390Sstevel@tonic-gate "snxt %u on %s from %s\n", 8400Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8410Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8420Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8430Sstevel@tonic-gate return; 8440Sstevel@tonic-gate } 8450Sstevel@tonic-gate } else { 8460Sstevel@tonic-gate /* 8470Sstevel@tonic-gate * Target must not be NULL in the PR_UNACKED state 8480Sstevel@tonic-gate */ 8490Sstevel@tonic-gate assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 8500Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 8510Sstevel@tonic-gate /* 8520Sstevel@tonic-gate * The probe stats slot is unused. So we didn't 8530Sstevel@tonic-gate * send out any probe to this target. This is a fake. 8540Sstevel@tonic-gate * Log an error. 
8550Sstevel@tonic-gate */ 8560Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 8570Sstevel@tonic-gate "snxt %u on %s from %s\n", 8580Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 8590Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 8600Sstevel@tonic-gate } 8610Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 8620Sstevel@tonic-gate return; 8630Sstevel@tonic-gate } 8640Sstevel@tonic-gate 8650Sstevel@tonic-gate /* 8660Sstevel@tonic-gate * If the rtt does not appear to be right, don't update the 8670Sstevel@tonic-gate * rtt stats. This can happen if the system dropped into the 8680Sstevel@tonic-gate * debugger, or the system was hung or too busy for a 8690Sstevel@tonic-gate * substantial time that we didn't get a chance to run. 8700Sstevel@tonic-gate */ 8710Sstevel@tonic-gate if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 8720Sstevel@tonic-gate /* 8730Sstevel@tonic-gate * If the probe corresponding to this receieved response 8740Sstevel@tonic-gate * was truly sent 'm' ms. ago, then this response must 8750Sstevel@tonic-gate * have been rejected by the sequence number checks. The 8760Sstevel@tonic-gate * fact that it has passed the sequence number checks 8770Sstevel@tonic-gate * means that the measured rtt is wrong. We were probably 8780Sstevel@tonic-gate * scheduled long after the packet was received. 8790Sstevel@tonic-gate */ 8800Sstevel@tonic-gate goto out; 8810Sstevel@tonic-gate } 8820Sstevel@tonic-gate 8830Sstevel@tonic-gate /* 8840Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 8850Sstevel@tonic-gate * The initial few responses after the interface is repaired may 8860Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 8870Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 
8880Sstevel@tonic-gate */ 8890Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 8900Sstevel@tonic-gate goto out; 8910Sstevel@tonic-gate 8920Sstevel@tonic-gate /* 8930Sstevel@tonic-gate * Don't update the Conservative Round Trip Time estimate for this 8940Sstevel@tonic-gate * (phint, target) pair if this is the not the highest ack seq seen 8950Sstevel@tonic-gate * thus far on this target. 8960Sstevel@tonic-gate */ 8970Sstevel@tonic-gate if (!highest_ack_tg(pr_icmp_seq, target)) 8980Sstevel@tonic-gate goto out; 8990Sstevel@tonic-gate 9000Sstevel@tonic-gate /* 9010Sstevel@tonic-gate * Always update the rtt. This is a failure detection probe 9020Sstevel@tonic-gate * and we want to measure both increase / decrease in rtt. 9030Sstevel@tonic-gate */ 9040Sstevel@tonic-gate pi_set_crtt(target, m, _B_TRUE); 9050Sstevel@tonic-gate 9060Sstevel@tonic-gate /* 9070Sstevel@tonic-gate * If the crtt exceeds the average time between probes, 9080Sstevel@tonic-gate * investigate if this slow target is an exception. If so we 9090Sstevel@tonic-gate * can avoid this target and still meet the failure detection 9100Sstevel@tonic-gate * time. Otherwise we can't meet the failure detection time. 9110Sstevel@tonic-gate */ 9120Sstevel@tonic-gate if (target->tg_crtt > pg->pg_probeint) { 9130Sstevel@tonic-gate exception = check_exception_target(pii, target); 9140Sstevel@tonic-gate if (exception) { 9150Sstevel@tonic-gate /* 9160Sstevel@tonic-gate * This target is exceptionally slow. Don't use it 9170Sstevel@tonic-gate * for future probes. check_exception_target() has 9180Sstevel@tonic-gate * made sure that we have at least MIN_PROBE_TARGETS 9190Sstevel@tonic-gate * other active targets 9200Sstevel@tonic-gate */ 9210Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 9220Sstevel@tonic-gate /* 9230Sstevel@tonic-gate * This is a slow router, mark it as slow 9240Sstevel@tonic-gate * and don't use it for further probes. 
We 9250Sstevel@tonic-gate * don't delete it, since it will be populated 9260Sstevel@tonic-gate * again when we do a router scan. Hence we 9270Sstevel@tonic-gate * need to maintain extra state (unlike the 9280Sstevel@tonic-gate * host case below). Mark it as TG_SLOW. 9290Sstevel@tonic-gate */ 9300Sstevel@tonic-gate if (target->tg_status == TG_ACTIVE) 9310Sstevel@tonic-gate pii->pii_ntargets--; 9320Sstevel@tonic-gate target->tg_status = TG_SLOW; 9330Sstevel@tonic-gate target->tg_latime = gethrtime(); 9340Sstevel@tonic-gate target->tg_rtt_sa = -1; 9350Sstevel@tonic-gate target->tg_crtt = 0; 9360Sstevel@tonic-gate target->tg_rtt_sd = 0; 9370Sstevel@tonic-gate if (pii->pii_target_next == target) { 9380Sstevel@tonic-gate pii->pii_target_next = 9390Sstevel@tonic-gate target_next(target); 9400Sstevel@tonic-gate } 9410Sstevel@tonic-gate } else { 9420Sstevel@tonic-gate /* 9430Sstevel@tonic-gate * the slow target is not a router, we can 9440Sstevel@tonic-gate * just delete it. Send an icmp multicast and 9450Sstevel@tonic-gate * pick the fastest responder that is not 9460Sstevel@tonic-gate * already an active target. target_delete() 9470Sstevel@tonic-gate * adjusts pii->pii_target_next 9480Sstevel@tonic-gate */ 9490Sstevel@tonic-gate target_delete(target); 9500Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 9510Sstevel@tonic-gate } 9520Sstevel@tonic-gate } else { 9530Sstevel@tonic-gate /* 9540Sstevel@tonic-gate * We can't meet the failure detection time. 9550Sstevel@tonic-gate * Log a message, and update the detection time to 9560Sstevel@tonic-gate * whatever we can achieve. 
9570Sstevel@tonic-gate */ 9580Sstevel@tonic-gate pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 9590Sstevel@tonic-gate pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 9600Sstevel@tonic-gate last_fdt_bumpup_time = gethrtime(); 9610Sstevel@tonic-gate if (pg != phyint_anongroup) { 9620Sstevel@tonic-gate logerr("Cannot meet requested failure detection" 9630Sstevel@tonic-gate " time of %d ms on (%s %s) new failure" 9640Sstevel@tonic-gate " detection time for group \"%s\" is %d" 9650Sstevel@tonic-gate " ms\n", user_failure_detection_time, 9660Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 9670Sstevel@tonic-gate pg->pg_name, pg->pg_fdt); 9680Sstevel@tonic-gate } 9690Sstevel@tonic-gate } 9700Sstevel@tonic-gate } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 9710Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 9720Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 9730Sstevel@tonic-gate /* 9740Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 9750Sstevel@tonic-gate * investigate if we can improve the failure detection time to 9760Sstevel@tonic-gate * meet whatever the user specified. 
9770Sstevel@tonic-gate */ 9780Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 9790Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 9800Sstevel@tonic-gate user_failure_detection_time); 9810Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 9820Sstevel@tonic-gate if (pg != phyint_anongroup) { 9830Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 9840Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 9850Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 9860Sstevel@tonic-gate pg->pg_name); 9870Sstevel@tonic-gate } 9880Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 9890Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 9900Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 9910Sstevel@tonic-gate /* 9920Sstevel@tonic-gate * No more rtt probes will be sent. The actual 9930Sstevel@tonic-gate * fdt has dropped to the user specified value. 9940Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 9950Sstevel@tonic-gate * will be in sync henceforth. 9960Sstevel@tonic-gate */ 9970Sstevel@tonic-gate reset_snxt_basetimes(); 9980Sstevel@tonic-gate } 9990Sstevel@tonic-gate } 10000Sstevel@tonic-gate } 10010Sstevel@tonic-gate out: 10020Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 10030Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 10040Sstevel@tonic-gate 10050Sstevel@tonic-gate /* 10060Sstevel@tonic-gate * Update pii->pii_rack, i.e. the sequence number of the last received 10070Sstevel@tonic-gate * probe response, based on the echo reply we have received now, if 10080Sstevel@tonic-gate * either of the following conditions are satisfied. 10090Sstevel@tonic-gate * a. pii_rack is outside the current receive window of 10100Sstevel@tonic-gate * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 
10110Sstevel@tonic-gate * This means we have not received probe responses for a 10120Sstevel@tonic-gate * long time, and the sequence number has wrapped around. 10130Sstevel@tonic-gate * b. pii_rack is within the current receive window and this echo 10140Sstevel@tonic-gate * reply corresponds to the highest sequence number we have seen 10150Sstevel@tonic-gate * so far. 10160Sstevel@tonic-gate */ 10170Sstevel@tonic-gate if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 10180Sstevel@tonic-gate SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 10190Sstevel@tonic-gate SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 10200Sstevel@tonic-gate pii->pii_rack = pr_icmp_seq; 10210Sstevel@tonic-gate } 10220Sstevel@tonic-gate } 10230Sstevel@tonic-gate 10240Sstevel@tonic-gate /* 10250Sstevel@tonic-gate * Returns true if seq is the highest unacknowledged seq for target tg 10260Sstevel@tonic-gate * else returns false 10270Sstevel@tonic-gate */ 10280Sstevel@tonic-gate static boolean_t 10290Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg) 10300Sstevel@tonic-gate { 10310Sstevel@tonic-gate struct phyint_instance *pii; 10320Sstevel@tonic-gate int pr_ndx; 10330Sstevel@tonic-gate uint16_t pr_seq; 10340Sstevel@tonic-gate 10350Sstevel@tonic-gate pii = tg->tg_phyint_inst; 10360Sstevel@tonic-gate 10370Sstevel@tonic-gate /* 10380Sstevel@tonic-gate * Get the seq number of the most recent probe sent so far, 10390Sstevel@tonic-gate * and also get the corresponding probe index in the probe stats 10400Sstevel@tonic-gate * array. 10410Sstevel@tonic-gate */ 10420Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 10430Sstevel@tonic-gate pr_seq = pii->pii_snxt; 10440Sstevel@tonic-gate pr_seq--; 10450Sstevel@tonic-gate 10460Sstevel@tonic-gate /* 10470Sstevel@tonic-gate * Start from the most recent probe and walk back, trying to find 10480Sstevel@tonic-gate * an acked probe corresponding to target tg. 
10490Sstevel@tonic-gate */ 10500Sstevel@tonic-gate for (; pr_ndx != pii->pii_probe_next; 10510Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 10520Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_target == tg && 10530Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 10540Sstevel@tonic-gate if (SEQ_GT(pr_seq, seq)) 10550Sstevel@tonic-gate return (_B_FALSE); 10560Sstevel@tonic-gate } 10570Sstevel@tonic-gate } 10580Sstevel@tonic-gate return (_B_TRUE); 10590Sstevel@tonic-gate } 10600Sstevel@tonic-gate 10610Sstevel@tonic-gate /* 10620Sstevel@tonic-gate * Check whether the crtt for the group has improved by a factor of 10630Sstevel@tonic-gate * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 10640Sstevel@tonic-gate * detection time flapping in the face of small crtt changes. 10650Sstevel@tonic-gate */ 10660Sstevel@tonic-gate static boolean_t 10670Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg) 10680Sstevel@tonic-gate { 10690Sstevel@tonic-gate struct phyint *pi; 10700Sstevel@tonic-gate 10710Sstevel@tonic-gate if (debug & D_PROBE) 10720Sstevel@tonic-gate logdebug("check_pg_crtt_improved()\n"); 10730Sstevel@tonic-gate 10740Sstevel@tonic-gate /* 10750Sstevel@tonic-gate * The crtt for the group is only improved if each phyint_instance 10760Sstevel@tonic-gate * for both ipv4 and ipv6 is improved. 10770Sstevel@tonic-gate */ 10780Sstevel@tonic-gate for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 10790Sstevel@tonic-gate if (!check_pii_crtt_improved(pi->pi_v4) || 10800Sstevel@tonic-gate !check_pii_crtt_improved(pi->pi_v6)) 10810Sstevel@tonic-gate return (_B_FALSE); 10820Sstevel@tonic-gate } 10830Sstevel@tonic-gate 10840Sstevel@tonic-gate return (_B_TRUE); 10850Sstevel@tonic-gate } 10860Sstevel@tonic-gate 10870Sstevel@tonic-gate /* 10880Sstevel@tonic-gate * Check whether the crtt has improved substantially on this phyint_instance. 
10890Sstevel@tonic-gate * Returns _B_TRUE if there's no crtt information available, because pii 10900Sstevel@tonic-gate * is NULL or the phyint_instance is not capable of probing. 10910Sstevel@tonic-gate */ 10920Sstevel@tonic-gate boolean_t 10930Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) { 10940Sstevel@tonic-gate struct target *tg; 10950Sstevel@tonic-gate 10960Sstevel@tonic-gate if (pii == NULL) 10970Sstevel@tonic-gate return (_B_TRUE); 10980Sstevel@tonic-gate 10990Sstevel@tonic-gate if (!PROBE_CAPABLE(pii) || 11000Sstevel@tonic-gate pii->pii_phyint->pi_state == PI_FAILED) 11010Sstevel@tonic-gate return (_B_TRUE); 11020Sstevel@tonic-gate 11030Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 11040Sstevel@tonic-gate if (tg->tg_status != TG_ACTIVE) 11050Sstevel@tonic-gate continue; 11060Sstevel@tonic-gate if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 11070Sstevel@tonic-gate LOWER_FDT_TRIGGER)) { 11080Sstevel@tonic-gate return (_B_FALSE); 11090Sstevel@tonic-gate } 11100Sstevel@tonic-gate } 11110Sstevel@tonic-gate 11120Sstevel@tonic-gate return (_B_TRUE); 11130Sstevel@tonic-gate } 11140Sstevel@tonic-gate 11150Sstevel@tonic-gate /* 11160Sstevel@tonic-gate * This target responds very slowly to probes. The target's crtt exceeds 11170Sstevel@tonic-gate * the probe interval of its group. 
Compare against other targets 11180Sstevel@tonic-gate * and determine if this target is an exception, if so return true, else false 11190Sstevel@tonic-gate */ 11200Sstevel@tonic-gate static boolean_t 11210Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target) 11220Sstevel@tonic-gate { 11230Sstevel@tonic-gate struct target *tg; 11240Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 11250Sstevel@tonic-gate 11260Sstevel@tonic-gate if (debug & D_PROBE) { 11270Sstevel@tonic-gate logdebug("check_exception_target(%s %s target %s)\n", 11280Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 11290Sstevel@tonic-gate pr_addr(pii->pii_af, target->tg_address, 11300Sstevel@tonic-gate abuf, sizeof (abuf))); 11310Sstevel@tonic-gate } 11320Sstevel@tonic-gate 11330Sstevel@tonic-gate /* 11340Sstevel@tonic-gate * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 11350Sstevel@tonic-gate * to make a good judgement. Otherwise don't drop this target. 11360Sstevel@tonic-gate */ 11370Sstevel@tonic-gate if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 11380Sstevel@tonic-gate return (_B_FALSE); 11390Sstevel@tonic-gate 11400Sstevel@tonic-gate /* 11410Sstevel@tonic-gate * Determine whether only this particular target is slow. 11420Sstevel@tonic-gate * We know that this target's crtt exceeds the group's probe interval. 11430Sstevel@tonic-gate * If all other active targets have a 11440Sstevel@tonic-gate * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 11450Sstevel@tonic-gate * then this target is considered slow. 
11460Sstevel@tonic-gate */ 11470Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 11480Sstevel@tonic-gate if (tg != target && tg->tg_status == TG_ACTIVE) { 11490Sstevel@tonic-gate if (tg->tg_crtt > 11500Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint / 11510Sstevel@tonic-gate EXCEPTION_FACTOR) { 11520Sstevel@tonic-gate return (_B_FALSE); 11530Sstevel@tonic-gate } 11540Sstevel@tonic-gate } 11550Sstevel@tonic-gate } 11560Sstevel@tonic-gate 11570Sstevel@tonic-gate return (_B_TRUE); 11580Sstevel@tonic-gate } 11590Sstevel@tonic-gate 11600Sstevel@tonic-gate /* 11610Sstevel@tonic-gate * Update the target list. The icmp all hosts multicast has given us 11620Sstevel@tonic-gate * some host to which we can send probes. If we already have sufficient 11630Sstevel@tonic-gate * targets, discard it. 11640Sstevel@tonic-gate */ 11650Sstevel@tonic-gate static void 11660Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 11670Sstevel@tonic-gate struct in6_addr fromaddr) 11680Sstevel@tonic-gate /* ARGSUSED */ 11690Sstevel@tonic-gate { 11700Sstevel@tonic-gate int af; 11710Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 11720Sstevel@tonic-gate struct phyint *pi; 11730Sstevel@tonic-gate 11740Sstevel@tonic-gate if (debug & D_PROBE) { 11750Sstevel@tonic-gate logdebug("incoming_mcast_reply(%s %s %s)\n", 11760Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 11770Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 11780Sstevel@tonic-gate } 11790Sstevel@tonic-gate 11800Sstevel@tonic-gate /* 11810Sstevel@tonic-gate * Using host targets is a fallback mechanism. If we have 11820Sstevel@tonic-gate * found a router, don't add this host target. If we already 11830Sstevel@tonic-gate * know MAX_PROBE_TARGETS, don't add another target. 
11840Sstevel@tonic-gate */ 11850Sstevel@tonic-gate assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 11860Sstevel@tonic-gate if (pii->pii_targets != NULL) { 11870Sstevel@tonic-gate if (pii->pii_targets_are_routers || 11880Sstevel@tonic-gate (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 11890Sstevel@tonic-gate return; 11900Sstevel@tonic-gate } 11910Sstevel@tonic-gate } 11920Sstevel@tonic-gate 11930Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 11940Sstevel@tonic-gate IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 11950Sstevel@tonic-gate /* 11960Sstevel@tonic-gate * Guard against response from 0.0.0.0 11970Sstevel@tonic-gate * and ::. Log a trace message 11980Sstevel@tonic-gate */ 11990Sstevel@tonic-gate logtrace("probe response from %s on %s\n", 12000Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 12010Sstevel@tonic-gate pii->pii_name); 12020Sstevel@tonic-gate return; 12030Sstevel@tonic-gate } 12040Sstevel@tonic-gate 12050Sstevel@tonic-gate /* 12060Sstevel@tonic-gate * This address is one of our own, so reject this address as a 12070Sstevel@tonic-gate * valid probe target. 12080Sstevel@tonic-gate */ 12090Sstevel@tonic-gate af = pii->pii_af; 12102250Srk129064 if (own_address(fromaddr)) 12110Sstevel@tonic-gate return; 12120Sstevel@tonic-gate 12130Sstevel@tonic-gate /* 12140Sstevel@tonic-gate * If the phyint is part a named group, then add the address to all 12150Sstevel@tonic-gate * members of the group. Otherwise, add the address only to the 12160Sstevel@tonic-gate * phyint itself, since other phyints in the anongroup may not be on 12170Sstevel@tonic-gate * the same subnet. 
12180Sstevel@tonic-gate */ 12190Sstevel@tonic-gate pi = pii->pii_phyint; 12200Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 12210Sstevel@tonic-gate target_add(pii, fromaddr, _B_FALSE); 12220Sstevel@tonic-gate } else { 12230Sstevel@tonic-gate pi = pi->pi_group->pg_phyint; 12240Sstevel@tonic-gate for (; pi != NULL; pi = pi->pi_pgnext) 12250Sstevel@tonic-gate target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 12260Sstevel@tonic-gate } 12270Sstevel@tonic-gate } 12280Sstevel@tonic-gate 12290Sstevel@tonic-gate /* 12300Sstevel@tonic-gate * Compute CRTT given an existing scaled average, scaled deviation estimate 12310Sstevel@tonic-gate * and a new rtt time. The formula is from Jacobson and Karels' 12320Sstevel@tonic-gate * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 12330Sstevel@tonic-gate * are the same as those in Appendix A.2 of that paper. 12340Sstevel@tonic-gate * 12350Sstevel@tonic-gate * m = new measurement 12360Sstevel@tonic-gate * sa = scaled RTT average (8 * average estimates) 12370Sstevel@tonic-gate * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 12380Sstevel@tonic-gate * crtt = Conservative round trip time. Used to determine whether probe 12390Sstevel@tonic-gate * has timed out. 
12400Sstevel@tonic-gate * 12410Sstevel@tonic-gate * New scaled average and deviation are passed back via sap and svp 12420Sstevel@tonic-gate */ 12430Sstevel@tonic-gate static int 12440Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m) 12450Sstevel@tonic-gate { 12460Sstevel@tonic-gate int sa = *sap; 12470Sstevel@tonic-gate int sv = *svp; 12480Sstevel@tonic-gate int crtt; 12490Sstevel@tonic-gate int saved_m = m; 12500Sstevel@tonic-gate 12510Sstevel@tonic-gate assert(*sap >= -1); 12520Sstevel@tonic-gate assert(*svp >= 0); 12530Sstevel@tonic-gate 12540Sstevel@tonic-gate if (sa != -1) { 12550Sstevel@tonic-gate /* 12560Sstevel@tonic-gate * Update average estimator: 12570Sstevel@tonic-gate * new rtt = old rtt + 1/8 Error 12580Sstevel@tonic-gate * where Error = m - old rtt 12590Sstevel@tonic-gate * i.e. 8 * new rtt = 8 * old rtt + Error 12600Sstevel@tonic-gate * i.e. new sa = old sa + Error 12610Sstevel@tonic-gate */ 12620Sstevel@tonic-gate m -= sa >> 3; /* m is now Error in estimate. */ 12630Sstevel@tonic-gate if ((sa += m) < 0) { 12640Sstevel@tonic-gate /* Don't allow the smoothed average to be negative. */ 12650Sstevel@tonic-gate sa = 0; 12660Sstevel@tonic-gate } 12670Sstevel@tonic-gate 12680Sstevel@tonic-gate /* 12690Sstevel@tonic-gate * Update deviation estimator: 12700Sstevel@tonic-gate * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 12710Sstevel@tonic-gate * i.e. 4 * new mdev = 4 * old mdev + 12720Sstevel@tonic-gate * (abs(Error) - old mdev) 12730Sstevel@tonic-gate * i.e. new sv = old sv + (abs(Error) - old mdev) 12740Sstevel@tonic-gate */ 12750Sstevel@tonic-gate if (m < 0) 12760Sstevel@tonic-gate m = -m; 12770Sstevel@tonic-gate m -= sv >> 2; 12780Sstevel@tonic-gate sv += m; 12790Sstevel@tonic-gate } else { 12800Sstevel@tonic-gate /* Initialization. This is the first response received. 
*/ 12810Sstevel@tonic-gate sa = (m << 3); 12820Sstevel@tonic-gate sv = (m << 1); 12830Sstevel@tonic-gate } 12840Sstevel@tonic-gate 12850Sstevel@tonic-gate crtt = (sa >> 3) + sv; 12860Sstevel@tonic-gate 12870Sstevel@tonic-gate if (debug & D_PROBE) { 12880Sstevel@tonic-gate logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 12890Sstevel@tonic-gate "%d\n", saved_m, sa, sv, crtt); 12900Sstevel@tonic-gate } 12910Sstevel@tonic-gate 12920Sstevel@tonic-gate *sap = sa; 12930Sstevel@tonic-gate *svp = sv; 12940Sstevel@tonic-gate 12950Sstevel@tonic-gate /* 12960Sstevel@tonic-gate * CRTT = average estimates + 4 * deviation estimates 12970Sstevel@tonic-gate * = sa / 8 + sv 12980Sstevel@tonic-gate */ 12990Sstevel@tonic-gate return (crtt); 13000Sstevel@tonic-gate } 13010Sstevel@tonic-gate 13020Sstevel@tonic-gate static void 13030Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 13040Sstevel@tonic-gate { 13050Sstevel@tonic-gate struct phyint_instance *pii = tg->tg_phyint_inst; 13060Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 13070Sstevel@tonic-gate int sa = tg->tg_rtt_sa; 13080Sstevel@tonic-gate int sv = tg->tg_rtt_sd; 13090Sstevel@tonic-gate int new_crtt; 13100Sstevel@tonic-gate int i; 13110Sstevel@tonic-gate 13120Sstevel@tonic-gate if (debug & D_PROBE) 13130Sstevel@tonic-gate logdebug("pi_set_crtt: target - m %d\n", m); 13140Sstevel@tonic-gate 13150Sstevel@tonic-gate /* store the round trip time, in case we need to defer computation */ 13160Sstevel@tonic-gate tg->tg_deferred[tg->tg_num_deferred] = m; 13170Sstevel@tonic-gate 13180Sstevel@tonic-gate new_crtt = compute_crtt(&sa, &sv, m); 13190Sstevel@tonic-gate 13200Sstevel@tonic-gate /* 13210Sstevel@tonic-gate * If this probe's round trip time would singlehandedly cause an 13220Sstevel@tonic-gate * increase in the group's probe interval consider it suspect. 
13230Sstevel@tonic-gate */ 13240Sstevel@tonic-gate if ((new_crtt > probe_interval) && is_probe_uni) { 13250Sstevel@tonic-gate if (debug & D_PROBE) { 13260Sstevel@tonic-gate logdebug("Received a suspect probe on %s, new_crtt =" 13270Sstevel@tonic-gate " %d, probe_interval = %d, num_deferred = %d\n", 13280Sstevel@tonic-gate pii->pii_probe_logint->li_name, new_crtt, 13290Sstevel@tonic-gate probe_interval, tg->tg_num_deferred); 13300Sstevel@tonic-gate } 13310Sstevel@tonic-gate 13320Sstevel@tonic-gate /* 13330Sstevel@tonic-gate * If we've deferred as many rtts as we plan on deferring, then 13340Sstevel@tonic-gate * assume the link really did slow down and process all queued 13350Sstevel@tonic-gate * rtts 13360Sstevel@tonic-gate */ 13370Sstevel@tonic-gate if (tg->tg_num_deferred == MAXDEFERREDRTT) { 13380Sstevel@tonic-gate if (debug & D_PROBE) { 13390Sstevel@tonic-gate logdebug("Received MAXDEFERREDRTT probes which " 13400Sstevel@tonic-gate "would cause an increased probe_interval. " 13410Sstevel@tonic-gate "Integrating queued rtt data points.\n"); 13420Sstevel@tonic-gate } 13430Sstevel@tonic-gate 13440Sstevel@tonic-gate for (i = 0; i <= tg->tg_num_deferred; i++) { 13450Sstevel@tonic-gate tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 13460Sstevel@tonic-gate &tg->tg_rtt_sd, tg->tg_deferred[i]); 13470Sstevel@tonic-gate } 13480Sstevel@tonic-gate 13490Sstevel@tonic-gate tg->tg_num_deferred = 0; 13500Sstevel@tonic-gate } else { 13510Sstevel@tonic-gate tg->tg_num_deferred++; 13520Sstevel@tonic-gate } 13530Sstevel@tonic-gate return; 13540Sstevel@tonic-gate } 13550Sstevel@tonic-gate 13560Sstevel@tonic-gate /* 13570Sstevel@tonic-gate * If this is a normal probe, or an RTT probe that would lead to a 13580Sstevel@tonic-gate * reduced CRTT, then update our CRTT data. Further, if this was 13590Sstevel@tonic-gate * a normal probe, pitch any deferred probes since our probes are 13600Sstevel@tonic-gate * again being answered within our CRTT estimates. 
13610Sstevel@tonic-gate */ 13620Sstevel@tonic-gate if (is_probe_uni || new_crtt < tg->tg_crtt) { 13630Sstevel@tonic-gate tg->tg_rtt_sa = sa; 13640Sstevel@tonic-gate tg->tg_rtt_sd = sv; 13650Sstevel@tonic-gate tg->tg_crtt = new_crtt; 13660Sstevel@tonic-gate if (is_probe_uni) 13670Sstevel@tonic-gate tg->tg_num_deferred = 0; 13680Sstevel@tonic-gate } 13690Sstevel@tonic-gate } 13700Sstevel@tonic-gate 13710Sstevel@tonic-gate /* 13720Sstevel@tonic-gate * Return a pointer to the specified option buffer. 13730Sstevel@tonic-gate * If not found return NULL. 13740Sstevel@tonic-gate */ 13750Sstevel@tonic-gate static void * 13760Sstevel@tonic-gate find_ancillary(struct msghdr *msg, int cmsg_type) 13770Sstevel@tonic-gate { 13780Sstevel@tonic-gate struct cmsghdr *cmsg; 13790Sstevel@tonic-gate 13800Sstevel@tonic-gate for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 13810Sstevel@tonic-gate cmsg = CMSG_NXTHDR(msg, cmsg)) { 13820Sstevel@tonic-gate if (cmsg->cmsg_level == IPPROTO_IPV6 && 13830Sstevel@tonic-gate cmsg->cmsg_type == cmsg_type) { 13840Sstevel@tonic-gate return (CMSG_DATA(cmsg)); 13850Sstevel@tonic-gate } 13860Sstevel@tonic-gate } 13870Sstevel@tonic-gate return (NULL); 13880Sstevel@tonic-gate } 13890Sstevel@tonic-gate 13900Sstevel@tonic-gate /* 13910Sstevel@tonic-gate * See if a previously failed interface has started working again. 
13920Sstevel@tonic-gate */ 13930Sstevel@tonic-gate void 13940Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi) 13950Sstevel@tonic-gate { 13960Sstevel@tonic-gate if (phyint_repaired(pi)) { 13970Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 13980Sstevel@tonic-gate logerr("NIC repair detected on %s\n", pi->pi_name); 13990Sstevel@tonic-gate } else { 14000Sstevel@tonic-gate logerr("NIC repair detected on %s of group %s\n", 14010Sstevel@tonic-gate pi->pi_name, pi->pi_group->pg_name); 14020Sstevel@tonic-gate } 14030Sstevel@tonic-gate 14040Sstevel@tonic-gate /* 14050Sstevel@tonic-gate * If the interface is offline, just clear the FAILED flag, 14060Sstevel@tonic-gate * delaying the state change and failback operation until it 14070Sstevel@tonic-gate * is brought back online. 14080Sstevel@tonic-gate */ 14090Sstevel@tonic-gate if (pi->pi_state == PI_OFFLINE) { 14100Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 14110Sstevel@tonic-gate return; 14120Sstevel@tonic-gate } 14130Sstevel@tonic-gate 1414704Sethindra if (pi->pi_flags & IFF_STANDBY) { 14150Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 14160Sstevel@tonic-gate } else { 1417*2496Smeem if (try_failback(pi) != IPMP_FAILURE) { 14180Sstevel@tonic-gate (void) change_lif_flags(pi, 14190Sstevel@tonic-gate IFF_FAILED, _B_FALSE); 14200Sstevel@tonic-gate /* Per state diagram */ 14210Sstevel@tonic-gate pi->pi_empty = 0; 14220Sstevel@tonic-gate } 14230Sstevel@tonic-gate } 14240Sstevel@tonic-gate 14250Sstevel@tonic-gate phyint_chstate(pi, PI_RUNNING); 14260Sstevel@tonic-gate 14270Sstevel@tonic-gate if (GROUP_FAILED(pi->pi_group)) { 14280Sstevel@tonic-gate /* 14290Sstevel@tonic-gate * This is the 1st phyint to receive a response 14300Sstevel@tonic-gate * after group failure. 
14310Sstevel@tonic-gate */ 14320Sstevel@tonic-gate logerr("At least 1 interface (%s) of group %s has " 14330Sstevel@tonic-gate "repaired\n", pi->pi_name, pi->pi_group->pg_name); 14340Sstevel@tonic-gate phyint_group_chstate(pi->pi_group, PG_RUNNING); 14352302Sudpa /* 14362302Sudpa * If this is the STANDBY phyint to be repaired after a 14372302Sudpa * group failure. Move data addresses on other failed 14382302Sudpa * phyints in the group to this one. 14392302Sudpa */ 14402302Sudpa if (pi->pi_flags & IFF_STANDBY) { 14412302Sudpa struct phyint *fpi = pi->pi_group->pg_phyint; 14422302Sudpa for (; fpi != NULL; fpi = fpi->pi_pgnext) { 14432302Sudpa if (fpi != pi) { 14442302Sudpa (void) try_failover(fpi, 14452302Sudpa FAILOVER_NORMAL); 14462302Sudpa } 14472302Sudpa } 14482302Sudpa } 14490Sstevel@tonic-gate } 14500Sstevel@tonic-gate } 14510Sstevel@tonic-gate } 14520Sstevel@tonic-gate 14530Sstevel@tonic-gate /* 14540Sstevel@tonic-gate * See if a previously functioning interface has failed, or if the 14550Sstevel@tonic-gate * whole group of interfaces has failed. 
14560Sstevel@tonic-gate */ 14570Sstevel@tonic-gate static void 14580Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii) 14590Sstevel@tonic-gate { 14600Sstevel@tonic-gate struct phyint *pi; 14610Sstevel@tonic-gate struct phyint *pi2; 14620Sstevel@tonic-gate 14630Sstevel@tonic-gate pi = pii->pii_phyint; 14640Sstevel@tonic-gate 14650Sstevel@tonic-gate switch (failure_state(pii)) { 14660Sstevel@tonic-gate case PHYINT_FAILURE: 14670Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 14680Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 14690Sstevel@tonic-gate logerr("NIC failure detected on %s\n", pii->pii_name); 14700Sstevel@tonic-gate } else { 14710Sstevel@tonic-gate logerr("NIC failure detected on %s of group %s\n", 14720Sstevel@tonic-gate pii->pii_name, pi->pi_group->pg_name); 14730Sstevel@tonic-gate } 14740Sstevel@tonic-gate /* 14750Sstevel@tonic-gate * Do the failover, unless the interface is offline (in 14760Sstevel@tonic-gate * which case we've already failed over). 
14770Sstevel@tonic-gate */ 14780Sstevel@tonic-gate if (pi->pi_state != PI_OFFLINE) { 14790Sstevel@tonic-gate phyint_chstate(pi, PI_FAILED); 14800Sstevel@tonic-gate reset_crtt_all(pi); 14810Sstevel@tonic-gate if (!(pi->pi_flags & IFF_INACTIVE)) 14820Sstevel@tonic-gate (void) try_failover(pi, FAILOVER_NORMAL); 14830Sstevel@tonic-gate } 14840Sstevel@tonic-gate break; 14850Sstevel@tonic-gate 14860Sstevel@tonic-gate case GROUP_FAILURE: 14870Sstevel@tonic-gate logerr("All Interfaces in group %s have failed\n", 14880Sstevel@tonic-gate pi->pi_group->pg_name); 14890Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 14900Sstevel@tonic-gate pi2 = pi2->pi_pgnext) { 14910Sstevel@tonic-gate if (pi2->pi_flags & IFF_OFFLINE) 14920Sstevel@tonic-gate continue; 14930Sstevel@tonic-gate (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 14940Sstevel@tonic-gate reset_crtt_all(pi2); 14950Sstevel@tonic-gate 14960Sstevel@tonic-gate /* 14970Sstevel@tonic-gate * In the case of host targets, we 14980Sstevel@tonic-gate * would have flushed the targets, 14990Sstevel@tonic-gate * and gone to PI_NOTARGETS state. 15000Sstevel@tonic-gate */ 15010Sstevel@tonic-gate if (pi2->pi_state == PI_RUNNING) 1502704Sethindra phyint_chstate(pi2, PI_FAILED); 15030Sstevel@tonic-gate 15040Sstevel@tonic-gate pi2->pi_empty = 0; 15050Sstevel@tonic-gate pi2->pi_full = 0; 15060Sstevel@tonic-gate } 15070Sstevel@tonic-gate break; 15080Sstevel@tonic-gate 15090Sstevel@tonic-gate default: 15100Sstevel@tonic-gate break; 15110Sstevel@tonic-gate } 15120Sstevel@tonic-gate } 15130Sstevel@tonic-gate 15140Sstevel@tonic-gate /* 15150Sstevel@tonic-gate * Determines if any timeout event has occurred and returns the number of 15160Sstevel@tonic-gate * milliseconds until the next timeout event for the phyint. Returns 15170Sstevel@tonic-gate * TIMER_INFINITY for "never". 
15180Sstevel@tonic-gate */ 15190Sstevel@tonic-gate uint_t 15200Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii) 15210Sstevel@tonic-gate { 15220Sstevel@tonic-gate int pr_ndx; 15230Sstevel@tonic-gate uint_t timeout; 15240Sstevel@tonic-gate struct target *cur_tg; 15250Sstevel@tonic-gate struct probe_stats *pr_statp; 15260Sstevel@tonic-gate struct phyint_instance *pii_other; 15270Sstevel@tonic-gate struct phyint *pi; 15280Sstevel@tonic-gate int valid_unack_count; 15290Sstevel@tonic-gate int i; 15300Sstevel@tonic-gate int interval; 15310Sstevel@tonic-gate uint_t check_time; 15320Sstevel@tonic-gate uint_t cur_time; 15330Sstevel@tonic-gate hrtime_t cur_hrtime; 15340Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 15350Sstevel@tonic-gate 15360Sstevel@tonic-gate cur_time = getcurrenttime(); 15370Sstevel@tonic-gate 15380Sstevel@tonic-gate if (debug & D_TIMER) { 15390Sstevel@tonic-gate logdebug("phyint_inst_timer(%s %s)\n", 15400Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 15410Sstevel@tonic-gate } 15420Sstevel@tonic-gate 15430Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 15440Sstevel@tonic-gate if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 15450Sstevel@tonic-gate /* 15460Sstevel@tonic-gate * Check to see if we're here due to link up/down flapping; If 15470Sstevel@tonic-gate * enough time has passed, then try to bring the interface 15480Sstevel@tonic-gate * back up; otherwise, schedule a timer to bring it back up 15490Sstevel@tonic-gate * when enough time *has* elapsed. 
15500Sstevel@tonic-gate */ 15510Sstevel@tonic-gate pi = pii->pii_phyint; 15520Sstevel@tonic-gate if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 15530Sstevel@tonic-gate check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 15540Sstevel@tonic-gate if (check_time > cur_time) 15550Sstevel@tonic-gate return (check_time - cur_time); 15560Sstevel@tonic-gate 15570Sstevel@tonic-gate phyint_check_for_repair(pi); 15580Sstevel@tonic-gate } 15590Sstevel@tonic-gate } 15600Sstevel@tonic-gate 15610Sstevel@tonic-gate /* 1562*2496Smeem * If probing is not enabled on this phyint instance, don't proceed. 15630Sstevel@tonic-gate */ 1564*2496Smeem if (!PROBE_ENABLED(pii)) 15650Sstevel@tonic-gate return (TIMER_INFINITY); 15660Sstevel@tonic-gate 15670Sstevel@tonic-gate /* 15680Sstevel@tonic-gate * If the timer has fired too soon, probably triggered 15690Sstevel@tonic-gate * by some other phyint instance, return the remaining 15700Sstevel@tonic-gate * time 15710Sstevel@tonic-gate */ 15720Sstevel@tonic-gate if (TIME_LT(cur_time, pii->pii_snxt_time)) 15730Sstevel@tonic-gate return (pii->pii_snxt_time - cur_time); 15740Sstevel@tonic-gate 15750Sstevel@tonic-gate /* 15760Sstevel@tonic-gate * If the link is down, don't send any probes for now. 15770Sstevel@tonic-gate */ 15780Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 15790Sstevel@tonic-gate return (TIMER_INFINITY); 15800Sstevel@tonic-gate 15810Sstevel@tonic-gate /* 15820Sstevel@tonic-gate * Randomize the next probe time, between MIN_RANDOM_FACTOR 15830Sstevel@tonic-gate * and MAX_RANDOM_FACTOR with respect to the base probe time. 15840Sstevel@tonic-gate * Base probe time is strictly periodic. 
15850Sstevel@tonic-gate */ 15860Sstevel@tonic-gate interval = GET_RANDOM( 15870Sstevel@tonic-gate (int)(MIN_RANDOM_FACTOR * user_probe_interval), 15880Sstevel@tonic-gate (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 15890Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 15900Sstevel@tonic-gate 15910Sstevel@tonic-gate /* 15920Sstevel@tonic-gate * Check if the current time > next time to probe. If so, we missed 15930Sstevel@tonic-gate * sending 1 or more probes, probably due to heavy system load. At least 15940Sstevel@tonic-gate * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 15950Sstevel@tonic-gate * were scheduled. Make adjustments to the times, in multiples of 15960Sstevel@tonic-gate * user_probe_interval. 15970Sstevel@tonic-gate */ 15980Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_snxt_time)) { 15990Sstevel@tonic-gate int n; 16000Sstevel@tonic-gate 16010Sstevel@tonic-gate n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 16020Sstevel@tonic-gate pii->pii_snxt_time += (n + 1) * user_probe_interval; 16030Sstevel@tonic-gate pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 16040Sstevel@tonic-gate logtrace("missed sending %d probes cur_time %u snxt_time %u" 16050Sstevel@tonic-gate " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 16060Sstevel@tonic-gate pii->pii_snxt_basetime); 16070Sstevel@tonic-gate 16080Sstevel@tonic-gate /* Collect statistics about missed probes */ 16090Sstevel@tonic-gate probes_missed.pm_nprobes += n + 1; 16100Sstevel@tonic-gate probes_missed.pm_ntimes++; 16110Sstevel@tonic-gate } 16120Sstevel@tonic-gate pii->pii_snxt_basetime += user_probe_interval; 16130Sstevel@tonic-gate interval = pii->pii_snxt_time - cur_time; 16140Sstevel@tonic-gate if (debug & D_TARGET) { 16150Sstevel@tonic-gate logdebug("cur_time %u snxt_time %u snxt_basetime %u" 16160Sstevel@tonic-gate " interval %u\n", cur_time, pii->pii_snxt_time, 16170Sstevel@tonic-gate pii->pii_snxt_basetime, interval); 
16180Sstevel@tonic-gate } 16190Sstevel@tonic-gate 16200Sstevel@tonic-gate /* 16210Sstevel@tonic-gate * If no targets are known, we need to send an ICMP multicast. The 16220Sstevel@tonic-gate * probe type is PROBE_MULTI. We'll check back in 'interval' msec 16230Sstevel@tonic-gate * to see if we found a target. 16240Sstevel@tonic-gate */ 16250Sstevel@tonic-gate if (pii->pii_target_next == NULL) { 16260Sstevel@tonic-gate assert(pii->pii_ntargets == 0); 16270Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 16280Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 16290Sstevel@tonic-gate return (interval); 16300Sstevel@tonic-gate } 16310Sstevel@tonic-gate 16320Sstevel@tonic-gate if ((user_probe_interval != probe_interval) && 16330Sstevel@tonic-gate TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 16340Sstevel@tonic-gate /* 16350Sstevel@tonic-gate * the failure detection (fd) probe timer has not yet fired. 16360Sstevel@tonic-gate * Need to send only an rtt probe. The probe type is PROBE_RTT. 16370Sstevel@tonic-gate */ 16380Sstevel@tonic-gate probe(pii, PROBE_RTT, cur_time); 16390Sstevel@tonic-gate return (interval); 16400Sstevel@tonic-gate } 16410Sstevel@tonic-gate /* 16420Sstevel@tonic-gate * the fd probe timer has fired. Need to do all failure 16430Sstevel@tonic-gate * detection / recovery calculations, and then send an fd probe 16440Sstevel@tonic-gate * of type PROBE_UNI. 16450Sstevel@tonic-gate */ 16460Sstevel@tonic-gate if (user_probe_interval == probe_interval) { 16470Sstevel@tonic-gate /* 16480Sstevel@tonic-gate * We could have missed some probes, and then adjusted 16490Sstevel@tonic-gate * pii_snxt_basetime above. Otherwise we could have 16500Sstevel@tonic-gate * blindly added probe_interval to pii_fd_snxt_basetime. 
16510Sstevel@tonic-gate */ 16520Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 16530Sstevel@tonic-gate } else { 16540Sstevel@tonic-gate pii->pii_fd_snxt_basetime += probe_interval; 16550Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 16560Sstevel@tonic-gate int n; 16570Sstevel@tonic-gate 16580Sstevel@tonic-gate n = (cur_time - pii->pii_fd_snxt_basetime) / 16590Sstevel@tonic-gate probe_interval; 16600Sstevel@tonic-gate pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 16610Sstevel@tonic-gate } 16620Sstevel@tonic-gate } 16630Sstevel@tonic-gate 16640Sstevel@tonic-gate /* 16650Sstevel@tonic-gate * We can have at most, the latest 2 probes that we sent, in 16660Sstevel@tonic-gate * the PR_UNACKED state. All previous probes sent, are either 16670Sstevel@tonic-gate * PR_LOST or PR_ACKED. An unacknowledged probe is considered 16680Sstevel@tonic-gate * timed out if the probe's time_sent + the CRTT < currenttime. 16690Sstevel@tonic-gate * For each of the last 2 probes, examine whether it has timed 16700Sstevel@tonic-gate * out. If so, mark it PR_LOST. The probe stats is a circular array. 16710Sstevel@tonic-gate */ 16720Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 16730Sstevel@tonic-gate valid_unack_count = 0; 16740Sstevel@tonic-gate 16750Sstevel@tonic-gate for (i = 0; i < 2; i++) { 16760Sstevel@tonic-gate pr_statp = &pii->pii_probes[pr_ndx]; 16770Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 16780Sstevel@tonic-gate switch (pr_statp->pr_status) { 16790Sstevel@tonic-gate case PR_ACKED: 16800Sstevel@tonic-gate /* 16810Sstevel@tonic-gate * We received back an ACK, so the switch clearly 16820Sstevel@tonic-gate * is not dropping our traffic, and thus we can 16830Sstevel@tonic-gate * enable failure detection immediately. 
16840Sstevel@tonic-gate */ 16850Sstevel@tonic-gate if (pii->pii_fd_hrtime > gethrtime()) { 16860Sstevel@tonic-gate if (debug & D_PROBE) { 16870Sstevel@tonic-gate logdebug("successful probe on %s; " 16880Sstevel@tonic-gate "ending quiet period\n", 16890Sstevel@tonic-gate pii->pii_phyint->pi_name); 16900Sstevel@tonic-gate } 16910Sstevel@tonic-gate pii->pii_fd_hrtime = gethrtime(); 16920Sstevel@tonic-gate } 16930Sstevel@tonic-gate break; 16940Sstevel@tonic-gate 16950Sstevel@tonic-gate case PR_UNACKED: 16960Sstevel@tonic-gate assert(cur_tg != NULL); 16970Sstevel@tonic-gate /* 16980Sstevel@tonic-gate * The crtt could be zero for some reason, 16990Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 17000Sstevel@tonic-gate * not available use group's probe interval, 17010Sstevel@tonic-gate * which is a worst case estimate. 17020Sstevel@tonic-gate */ 17030Sstevel@tonic-gate if (cur_tg->tg_crtt != 0) { 17040Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 17050Sstevel@tonic-gate cur_tg->tg_crtt; 17060Sstevel@tonic-gate } else { 17070Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 17080Sstevel@tonic-gate probe_interval; 17090Sstevel@tonic-gate } 17100Sstevel@tonic-gate if (TIME_LT(timeout, cur_time)) { 17110Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 17120Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 17130Sstevel@tonic-gate } else if (i == 1) { 17140Sstevel@tonic-gate /* 17150Sstevel@tonic-gate * We are forced to consider this probe 17160Sstevel@tonic-gate * lost, as we can have at most 2 unack. 17170Sstevel@tonic-gate * probes any time, and we will be sending a 17180Sstevel@tonic-gate * probe at the end of this function. 17190Sstevel@tonic-gate * Normally, we should not be here, but 17200Sstevel@tonic-gate * this can happen if an incoming response 17210Sstevel@tonic-gate * that was considered lost has increased 17220Sstevel@tonic-gate * the crtt for this target, and also bumped 17230Sstevel@tonic-gate * up the FDT. 
Note that we never cancel or 17240Sstevel@tonic-gate * increase the current pii_time_left, so 17250Sstevel@tonic-gate * when the timer fires, we find 2 valid 17260Sstevel@tonic-gate * unacked probes, and they are yet to timeout 17270Sstevel@tonic-gate */ 17280Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 17290Sstevel@tonic-gate pr_statp->pr_time_lost = cur_time; 17300Sstevel@tonic-gate } else { 17310Sstevel@tonic-gate /* 17320Sstevel@tonic-gate * Only the most recent probe can enter 17330Sstevel@tonic-gate * this 'else' arm. The second most recent 17340Sstevel@tonic-gate * probe must take either of the above arms, 17350Sstevel@tonic-gate * if it is unacked. 17360Sstevel@tonic-gate */ 17370Sstevel@tonic-gate valid_unack_count++; 17380Sstevel@tonic-gate } 17390Sstevel@tonic-gate break; 17400Sstevel@tonic-gate } 17410Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx); 17420Sstevel@tonic-gate } 17430Sstevel@tonic-gate 17440Sstevel@tonic-gate /* 17450Sstevel@tonic-gate * We send out 1 probe randomly in the interval between one half 17460Sstevel@tonic-gate * and one probe interval for the group. Given that the CRTT is always 17470Sstevel@tonic-gate * less than the group's probe interval, we can have at most 1 17480Sstevel@tonic-gate * unacknowledged probe now. All previous probes are either lost or 17490Sstevel@tonic-gate * acked. 17500Sstevel@tonic-gate */ 17510Sstevel@tonic-gate assert(valid_unack_count == 0 || valid_unack_count == 1); 17520Sstevel@tonic-gate 17530Sstevel@tonic-gate /* 17540Sstevel@tonic-gate * The timer has fired. Take appropriate action depending 17550Sstevel@tonic-gate * on the current state of the phyint. 
17560Sstevel@tonic-gate * 17570Sstevel@tonic-gate * PI_RUNNING state - Failure detection and failover 17580Sstevel@tonic-gate * PI_FAILED state - Repair detection and failback 17590Sstevel@tonic-gate */ 17600Sstevel@tonic-gate switch (pii->pii_phyint->pi_state) { 17610Sstevel@tonic-gate case PI_FAILED: 17620Sstevel@tonic-gate /* 17630Sstevel@tonic-gate * If the most recent probe (excluding unacked probes that 17640Sstevel@tonic-gate * are yet to time out) has been acked, check whether the 17650Sstevel@tonic-gate * phyint is now repaired. If the phyint is repaired, then 17660Sstevel@tonic-gate * attempt failback, unless it is an inactive standby. 17670Sstevel@tonic-gate */ 17680Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 17690Sstevel@tonic-gate phyint_check_for_repair(pii->pii_phyint); 17700Sstevel@tonic-gate } 17710Sstevel@tonic-gate break; 17720Sstevel@tonic-gate 17730Sstevel@tonic-gate case PI_RUNNING: 17740Sstevel@tonic-gate /* 17750Sstevel@tonic-gate * It's possible our probes have been lost because of a 17760Sstevel@tonic-gate * spanning-tree mandated quiet period on the switch. If so, 17770Sstevel@tonic-gate * ignore the lost probes and consider the interface to still 17780Sstevel@tonic-gate * be functioning. 17790Sstevel@tonic-gate */ 17800Sstevel@tonic-gate cur_hrtime = gethrtime(); 17810Sstevel@tonic-gate if (pii->pii_fd_hrtime - cur_hrtime > 0) 17820Sstevel@tonic-gate break; 17830Sstevel@tonic-gate 17840Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 17850Sstevel@tonic-gate /* 17860Sstevel@tonic-gate * We have 1 or more failed probes (excluding unacked 17870Sstevel@tonic-gate * probes that are yet to time out). Determine if the 17880Sstevel@tonic-gate * phyint has failed. 
If so attempt a failover, 17890Sstevel@tonic-gate * unless it is an inactive standby 17900Sstevel@tonic-gate */ 17910Sstevel@tonic-gate phyint_inst_check_for_failure(pii); 17920Sstevel@tonic-gate } 17930Sstevel@tonic-gate break; 17940Sstevel@tonic-gate 17950Sstevel@tonic-gate default: 17960Sstevel@tonic-gate logerr("phyint_inst_timer: invalid state %d\n", 17970Sstevel@tonic-gate pii->pii_phyint->pi_state); 17980Sstevel@tonic-gate abort(); 17990Sstevel@tonic-gate } 18000Sstevel@tonic-gate 18010Sstevel@tonic-gate /* 18020Sstevel@tonic-gate * Start the next probe. probe() will also set pii->pii_probe_time_left 18030Sstevel@tonic-gate * to the group's probe interval. If phyint_failed -> target_flush_hosts 18040Sstevel@tonic-gate * was called, the target list may be empty. 18050Sstevel@tonic-gate */ 18060Sstevel@tonic-gate if (pii->pii_target_next != NULL) { 18070Sstevel@tonic-gate probe(pii, PROBE_UNI, cur_time); 18080Sstevel@tonic-gate /* 18090Sstevel@tonic-gate * If we have just the one probe target, and we're not using 18100Sstevel@tonic-gate * router targets, try to find another as we presently have 18110Sstevel@tonic-gate * no resilience. 18120Sstevel@tonic-gate */ 18130Sstevel@tonic-gate if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 18140Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 18150Sstevel@tonic-gate } else { 18160Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 18170Sstevel@tonic-gate } 18180Sstevel@tonic-gate return (interval); 18190Sstevel@tonic-gate } 18200Sstevel@tonic-gate 18210Sstevel@tonic-gate /* 18220Sstevel@tonic-gate * Start the probe timer for an interface instance. 
18230Sstevel@tonic-gate */ 18240Sstevel@tonic-gate void 18250Sstevel@tonic-gate start_timer(struct phyint_instance *pii) 18260Sstevel@tonic-gate { 18270Sstevel@tonic-gate uint32_t interval; 18280Sstevel@tonic-gate 18290Sstevel@tonic-gate /* 18300Sstevel@tonic-gate * Spread the base probe times (pi_snxt_basetime) across phyints 18310Sstevel@tonic-gate * uniformly over the (curtime..curtime + the group's probe_interval). 18320Sstevel@tonic-gate * pi_snxt_basetime is strictly periodic with a frequency of 18330Sstevel@tonic-gate * the group's probe interval. The actual probe time pi_snxt_time 18340Sstevel@tonic-gate * adds some randomness to pi_snxt_basetime and happens in probe(). 18350Sstevel@tonic-gate * For the 1st probe on each phyint after the timer is started, 18360Sstevel@tonic-gate * pi_snxt_time and pi_snxt_basetime are the same. 18370Sstevel@tonic-gate */ 18380Sstevel@tonic-gate interval = GET_RANDOM(0, 18390Sstevel@tonic-gate (int)pii->pii_phyint->pi_group->pg_probeint); 18400Sstevel@tonic-gate 18410Sstevel@tonic-gate pii->pii_snxt_basetime = getcurrenttime() + interval; 18420Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 18430Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime; 18440Sstevel@tonic-gate timer_schedule(interval); 18450Sstevel@tonic-gate } 18460Sstevel@tonic-gate 18470Sstevel@tonic-gate /* 18480Sstevel@tonic-gate * Restart the probe timer on an interface instance. 18490Sstevel@tonic-gate */ 18500Sstevel@tonic-gate static void 18510Sstevel@tonic-gate restart_timer(struct phyint_instance *pii) 18520Sstevel@tonic-gate { 18530Sstevel@tonic-gate /* 18540Sstevel@tonic-gate * We don't need to restart the timer if it was never started in 18550Sstevel@tonic-gate * the first place (pii->pii_basetime_inited not set), as the timer 18560Sstevel@tonic-gate * won't have gone off yet. 
18570Sstevel@tonic-gate */ 18580Sstevel@tonic-gate if (pii->pii_basetime_inited != 0) { 18590Sstevel@tonic-gate 18600Sstevel@tonic-gate if (debug & D_LINKNOTE) 18610Sstevel@tonic-gate logdebug("restart timer: restarting timer on %s, " 18620Sstevel@tonic-gate "address family %s\n", pii->pii_phyint->pi_name, 18630Sstevel@tonic-gate AF_STR(pii->pii_af)); 18640Sstevel@tonic-gate 18650Sstevel@tonic-gate start_timer(pii); 18660Sstevel@tonic-gate } 18670Sstevel@tonic-gate } 18680Sstevel@tonic-gate 18690Sstevel@tonic-gate static void 18700Sstevel@tonic-gate process_link_state_down(struct phyint *pi) 18710Sstevel@tonic-gate { 18720Sstevel@tonic-gate logerr("The link has gone down on %s\n", pi->pi_name); 18730Sstevel@tonic-gate 18740Sstevel@tonic-gate /* 18750Sstevel@tonic-gate * Clear the probe statistics arrays, we don't want the repair 18760Sstevel@tonic-gate * detection logic relying on probes that were succesful prior 18770Sstevel@tonic-gate * to the link going down. 18780Sstevel@tonic-gate */ 18790Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v4)) 18800Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v4); 18810Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v6)) 18820Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v6); 18830Sstevel@tonic-gate /* 18840Sstevel@tonic-gate * Check for interface failure. Although we know the interface 18850Sstevel@tonic-gate * has failed, we don't know if all the other interfaces in the 18860Sstevel@tonic-gate * group have failed as well. 
18870Sstevel@tonic-gate */ 18880Sstevel@tonic-gate if ((pi->pi_state == PI_RUNNING) || 18890Sstevel@tonic-gate (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 18900Sstevel@tonic-gate if (debug & D_LINKNOTE) { 18910Sstevel@tonic-gate logdebug("process_link_state_down:" 18920Sstevel@tonic-gate " checking for failure on %s\n", pi->pi_name); 18930Sstevel@tonic-gate } 18940Sstevel@tonic-gate 18950Sstevel@tonic-gate if (pi->pi_v4 != NULL) 18960Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v4); 18970Sstevel@tonic-gate else if (pi->pi_v6 != NULL) 18980Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v6); 18990Sstevel@tonic-gate } 19000Sstevel@tonic-gate } 19010Sstevel@tonic-gate 19020Sstevel@tonic-gate static void 19030Sstevel@tonic-gate process_link_state_up(struct phyint *pi) 19040Sstevel@tonic-gate { 19050Sstevel@tonic-gate logerr("The link has come up on %s\n", pi->pi_name); 19060Sstevel@tonic-gate 19070Sstevel@tonic-gate /* 19080Sstevel@tonic-gate * We stopped any running timers on each instance when the link 19090Sstevel@tonic-gate * went down, so restart them. 19100Sstevel@tonic-gate */ 19110Sstevel@tonic-gate if (pi->pi_v4) 19120Sstevel@tonic-gate restart_timer(pi->pi_v4); 19130Sstevel@tonic-gate if (pi->pi_v6) 19140Sstevel@tonic-gate restart_timer(pi->pi_v6); 19150Sstevel@tonic-gate 19160Sstevel@tonic-gate phyint_check_for_repair(pi); 19170Sstevel@tonic-gate 19180Sstevel@tonic-gate pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 19190Sstevel@tonic-gate if (pi->pi_whendx == LINK_UP_PERMIN) 19200Sstevel@tonic-gate pi->pi_whendx = 0; 19210Sstevel@tonic-gate } 19220Sstevel@tonic-gate 19230Sstevel@tonic-gate /* 19240Sstevel@tonic-gate * Process any changes in link state passed up from the interfaces. 
19250Sstevel@tonic-gate */ 19260Sstevel@tonic-gate void 19270Sstevel@tonic-gate process_link_state_changes(void) 19280Sstevel@tonic-gate { 19290Sstevel@tonic-gate struct phyint *pi; 19300Sstevel@tonic-gate 19310Sstevel@tonic-gate /* Look for interfaces where the link state has just changed */ 19320Sstevel@tonic-gate 19330Sstevel@tonic-gate for (pi = phyints; pi != NULL; pi = pi->pi_next) { 19340Sstevel@tonic-gate boolean_t old_link_state_up = LINK_UP(pi); 19350Sstevel@tonic-gate 19360Sstevel@tonic-gate /* 19370Sstevel@tonic-gate * Except when the "phyint" structure is created, this is 19380Sstevel@tonic-gate * the only place the link state is updated. This allows 19390Sstevel@tonic-gate * this routine to detect changes in link state, rather 19400Sstevel@tonic-gate * than just the current state. 19410Sstevel@tonic-gate */ 19420Sstevel@tonic-gate UPDATE_LINK_STATE(pi); 19430Sstevel@tonic-gate 19440Sstevel@tonic-gate if (LINK_DOWN(pi)) { 19450Sstevel@tonic-gate /* 19460Sstevel@tonic-gate * Has link just gone down? 19470Sstevel@tonic-gate */ 19480Sstevel@tonic-gate if (old_link_state_up) 19490Sstevel@tonic-gate process_link_state_down(pi); 19500Sstevel@tonic-gate } else { 19510Sstevel@tonic-gate /* 19520Sstevel@tonic-gate * Has link just gone back up? 
19530Sstevel@tonic-gate */ 19540Sstevel@tonic-gate if (!old_link_state_up) 19550Sstevel@tonic-gate process_link_state_up(pi); 19560Sstevel@tonic-gate } 19570Sstevel@tonic-gate } 19580Sstevel@tonic-gate } 19590Sstevel@tonic-gate 19600Sstevel@tonic-gate void 19610Sstevel@tonic-gate reset_crtt_all(struct phyint *pi) 19620Sstevel@tonic-gate { 19630Sstevel@tonic-gate struct phyint_instance *pii; 19640Sstevel@tonic-gate struct target *tg; 19650Sstevel@tonic-gate 19660Sstevel@tonic-gate pii = pi->pi_v4; 19670Sstevel@tonic-gate if (pii != NULL) { 19680Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 19690Sstevel@tonic-gate tg->tg_crtt = 0; 19700Sstevel@tonic-gate tg->tg_rtt_sa = -1; 19710Sstevel@tonic-gate tg->tg_rtt_sd = 0; 19720Sstevel@tonic-gate } 19730Sstevel@tonic-gate } 19740Sstevel@tonic-gate 19750Sstevel@tonic-gate pii = pi->pi_v6; 19760Sstevel@tonic-gate if (pii != NULL) { 19770Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 19780Sstevel@tonic-gate tg->tg_crtt = 0; 19790Sstevel@tonic-gate tg->tg_rtt_sa = -1; 19800Sstevel@tonic-gate tg->tg_rtt_sd = 0; 19810Sstevel@tonic-gate } 19820Sstevel@tonic-gate } 19830Sstevel@tonic-gate } 19840Sstevel@tonic-gate 19850Sstevel@tonic-gate /* 19860Sstevel@tonic-gate * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 19870Sstevel@tonic-gate * probes on both instances IPv4 and IPv6. 19880Sstevel@tonic-gate * If the interface has failed, return the time of the first probe failure 19890Sstevel@tonic-gate * in "tff". 
19900Sstevel@tonic-gate */ 19910Sstevel@tonic-gate static int 19920Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 19930Sstevel@tonic-gate { 19940Sstevel@tonic-gate uint_t pi_tff; 19950Sstevel@tonic-gate struct target *cur_tg; 19960Sstevel@tonic-gate struct probe_fail_count pfinfo; 19970Sstevel@tonic-gate struct phyint_instance *pii_other; 19980Sstevel@tonic-gate int pr_ndx; 19990Sstevel@tonic-gate 20000Sstevel@tonic-gate /* 20010Sstevel@tonic-gate * Get the number of consecutive failed probes on 20020Sstevel@tonic-gate * this phyint across all targets. Also get the number 20030Sstevel@tonic-gate * of consecutive failed probes on this target only 20040Sstevel@tonic-gate */ 20050Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 20060Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 20070Sstevel@tonic-gate probe_fail_info(pii, cur_tg, &pfinfo); 20080Sstevel@tonic-gate 20090Sstevel@tonic-gate /* Get the time of first failure, for later use */ 20100Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 20110Sstevel@tonic-gate 20120Sstevel@tonic-gate /* 20130Sstevel@tonic-gate * If the current target has not responded to the 20140Sstevel@tonic-gate * last NUM_PROBE_FAILS probes, and other targets are 20150Sstevel@tonic-gate * responding delete this target. Dead gateway detection 20160Sstevel@tonic-gate * will eventually remove this target (if router) from the 20170Sstevel@tonic-gate * routing tables. If that does not occur, we may end 20180Sstevel@tonic-gate * up adding this to our list again. 
20190Sstevel@tonic-gate */ 20200Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 20210Sstevel@tonic-gate pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 20220Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 20230Sstevel@tonic-gate if (cur_tg->tg_status == TG_ACTIVE) 20240Sstevel@tonic-gate pii->pii_ntargets--; 20250Sstevel@tonic-gate cur_tg->tg_status = TG_DEAD; 20260Sstevel@tonic-gate cur_tg->tg_crtt = 0; 20270Sstevel@tonic-gate cur_tg->tg_rtt_sa = -1; 20280Sstevel@tonic-gate cur_tg->tg_rtt_sd = 0; 20290Sstevel@tonic-gate if (pii->pii_target_next == cur_tg) 20300Sstevel@tonic-gate pii->pii_target_next = target_next(cur_tg); 20310Sstevel@tonic-gate } else { 20320Sstevel@tonic-gate target_delete(cur_tg); 20330Sstevel@tonic-gate probe(pii, PROBE_MULTI, getcurrenttime()); 20340Sstevel@tonic-gate } 20350Sstevel@tonic-gate return (PHYINT_OK); 20360Sstevel@tonic-gate } 20370Sstevel@tonic-gate 20380Sstevel@tonic-gate /* 20390Sstevel@tonic-gate * If the phyint has lost NUM_PROBE_FAILS or more 20400Sstevel@tonic-gate * consecutive probes, on both IPv4 and IPv6 protocol 20410Sstevel@tonic-gate * instances of the phyint, then trigger failure 20420Sstevel@tonic-gate * detection, else return false 20430Sstevel@tonic-gate */ 20440Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 20450Sstevel@tonic-gate return (PHYINT_OK); 20460Sstevel@tonic-gate 20470Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 20480Sstevel@tonic-gate if (PROBE_CAPABLE(pii_other)) { 20490Sstevel@tonic-gate probe_fail_info(pii_other, NULL, &pfinfo); 20500Sstevel@tonic-gate if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 20510Sstevel@tonic-gate /* 20520Sstevel@tonic-gate * We have NUM_PROBE_FAILS or more failures 20530Sstevel@tonic-gate * on both IPv4 and IPv6. Get the earliest 20540Sstevel@tonic-gate * time when failure was detected on this 20550Sstevel@tonic-gate * phyint across IPv4 and IPv6. 
20560Sstevel@tonic-gate */ 20570Sstevel@tonic-gate if (TIME_LT(pfinfo.pf_tff, pi_tff)) 20580Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 20590Sstevel@tonic-gate } else { 20600Sstevel@tonic-gate /* 20610Sstevel@tonic-gate * This instance has < NUM_PROBE_FAILS failure. 20620Sstevel@tonic-gate * So return false 20630Sstevel@tonic-gate */ 20640Sstevel@tonic-gate return (PHYINT_OK); 20650Sstevel@tonic-gate } 20660Sstevel@tonic-gate } 20670Sstevel@tonic-gate *tff = pi_tff; 20680Sstevel@tonic-gate return (PHYINT_FAILURE); 20690Sstevel@tonic-gate } 20700Sstevel@tonic-gate 20710Sstevel@tonic-gate /* 20720Sstevel@tonic-gate * Check if the link has gone down on this phyint, or it has failed the 20730Sstevel@tonic-gate * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 20740Sstevel@tonic-gate * Also look at other phyints of this group, for group failures. 20750Sstevel@tonic-gate */ 20760Sstevel@tonic-gate int 20770Sstevel@tonic-gate failure_state(struct phyint_instance *pii) 20780Sstevel@tonic-gate { 20790Sstevel@tonic-gate struct probe_success_count psinfo; 20800Sstevel@tonic-gate uint_t pi2_tls; /* time last success */ 20810Sstevel@tonic-gate uint_t pi_tff; /* time first fail */ 20820Sstevel@tonic-gate struct phyint *pi2; 20830Sstevel@tonic-gate struct phyint *pi; 20840Sstevel@tonic-gate struct phyint_instance *pii2; 20850Sstevel@tonic-gate struct phyint_group *pg; 20860Sstevel@tonic-gate boolean_t alone; 20870Sstevel@tonic-gate 20880Sstevel@tonic-gate if (debug & D_FAILOVER) 20890Sstevel@tonic-gate logdebug("phyint_failed(%s)\n", pii->pii_name); 20900Sstevel@tonic-gate 20910Sstevel@tonic-gate pi = pii->pii_phyint; 20920Sstevel@tonic-gate pg = pi->pi_group; 20930Sstevel@tonic-gate 20940Sstevel@tonic-gate if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 20950Sstevel@tonic-gate PHYINT_OK) 20960Sstevel@tonic-gate return (PHYINT_OK); 20970Sstevel@tonic-gate 20980Sstevel@tonic-gate /* 20990Sstevel@tonic-gate * At this point, the link is 
down, or the phyint is suspect, 21000Sstevel@tonic-gate * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 21010Sstevel@tonic-gate * does not belong to any group, or is the only member of the 21020Sstevel@tonic-gate * group capable of being probed, return PHYINT_FAILURE. 21030Sstevel@tonic-gate */ 21040Sstevel@tonic-gate alone = _B_TRUE; 21050Sstevel@tonic-gate if (pg != phyint_anongroup) { 21060Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 21070Sstevel@tonic-gate if (pi2 == pi) 21080Sstevel@tonic-gate continue; 21090Sstevel@tonic-gate if (PROBE_CAPABLE(pi2->pi_v4) || 21100Sstevel@tonic-gate PROBE_CAPABLE(pi2->pi_v6)) { 21110Sstevel@tonic-gate alone = _B_FALSE; 21120Sstevel@tonic-gate break; 21130Sstevel@tonic-gate } 21140Sstevel@tonic-gate } 21150Sstevel@tonic-gate } 21160Sstevel@tonic-gate if (alone) 21170Sstevel@tonic-gate return (PHYINT_FAILURE); 21180Sstevel@tonic-gate 21190Sstevel@tonic-gate /* 21200Sstevel@tonic-gate * Need to compare against other phyints of the same group 21210Sstevel@tonic-gate * to exclude group failures. If the failure was detected via 21220Sstevel@tonic-gate * probing, then if the time of last success (tls) of any 21230Sstevel@tonic-gate * phyint is more recent than the time of first fail (tff) of the 21240Sstevel@tonic-gate * phyint in question, and the link is up on the phyint, 21250Sstevel@tonic-gate * then it is a phyint failure. Otherwise it is a group failure. 21260Sstevel@tonic-gate * If failure was detected via a link down notification sent from 21270Sstevel@tonic-gate * the driver to IP, we see if any phyints in the group are still 21280Sstevel@tonic-gate * running and haven't received a link down notification. We 21290Sstevel@tonic-gate * will usually be processing the link down notification shortly 21300Sstevel@tonic-gate * after it was received, so there is no point looking at the tls 21310Sstevel@tonic-gate * of other phyints. 
21320Sstevel@tonic-gate */ 21330Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 21340Sstevel@tonic-gate /* Exclude ourself from comparison */ 21350Sstevel@tonic-gate if (pi2 == pi) 21360Sstevel@tonic-gate continue; 21370Sstevel@tonic-gate 21380Sstevel@tonic-gate if (LINK_DOWN(pi)) { 21390Sstevel@tonic-gate /* 21400Sstevel@tonic-gate * We use FLAGS_TO_LINK_STATE() to test the 21410Sstevel@tonic-gate * flags directly, rather then LINK_UP() or 21420Sstevel@tonic-gate * LINK_DOWN(), as we may not have got round 21430Sstevel@tonic-gate * to processing the link state for the other 21440Sstevel@tonic-gate * phyints in the group yet. 21450Sstevel@tonic-gate * 21460Sstevel@tonic-gate * The check for PI_RUNNING and group 21470Sstevel@tonic-gate * failure handles the case when the 21480Sstevel@tonic-gate * group begins to recover. The first 21490Sstevel@tonic-gate * phyint to recover should not trigger 21500Sstevel@tonic-gate * a failover from the soon-to-recover 21510Sstevel@tonic-gate * other phyints to the first recovered 21520Sstevel@tonic-gate * phyint. PI_RUNNING will be set, and 21530Sstevel@tonic-gate * pg_groupfailed cleared only after 21540Sstevel@tonic-gate * receipt of NUM_PROBE_REPAIRS, by 21550Sstevel@tonic-gate * which time the other phyints should 21560Sstevel@tonic-gate * have received at least 1 packet, 21570Sstevel@tonic-gate * and so will not have NUM_PROBE_FAILS. 21580Sstevel@tonic-gate */ 21590Sstevel@tonic-gate if ((pi2->pi_state == PI_RUNNING) && 21600Sstevel@tonic-gate !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 21610Sstevel@tonic-gate return (PHYINT_FAILURE); 21620Sstevel@tonic-gate } else { 21630Sstevel@tonic-gate /* 21640Sstevel@tonic-gate * Need to compare against both IPv4 and 21650Sstevel@tonic-gate * IPv6 instances. 
21660Sstevel@tonic-gate */ 21670Sstevel@tonic-gate pii2 = pi2->pi_v4; 21680Sstevel@tonic-gate if (pii2 != NULL) { 21690Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 21700Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 21710Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 21720Sstevel@tonic-gate /* 21730Sstevel@tonic-gate * See comment above regarding check 21740Sstevel@tonic-gate * for PI_RUNNING and group failure. 21750Sstevel@tonic-gate */ 21760Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 21770Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 21780Sstevel@tonic-gate !GROUP_FAILED(pg) && 21790Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 21800Sstevel@tonic-gate return (PHYINT_FAILURE); 21810Sstevel@tonic-gate } 21820Sstevel@tonic-gate } 21830Sstevel@tonic-gate 21840Sstevel@tonic-gate pii2 = pi2->pi_v6; 21850Sstevel@tonic-gate if (pii2 != NULL) { 21860Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 21870Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 21880Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 21890Sstevel@tonic-gate /* 21900Sstevel@tonic-gate * See comment above regarding check 21910Sstevel@tonic-gate * for PI_RUNNING and group failure. 21920Sstevel@tonic-gate */ 21930Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 21940Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 21950Sstevel@tonic-gate !GROUP_FAILED(pg) && 21960Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 21970Sstevel@tonic-gate return (PHYINT_FAILURE); 21980Sstevel@tonic-gate } 21990Sstevel@tonic-gate } 22000Sstevel@tonic-gate } 22010Sstevel@tonic-gate } 22020Sstevel@tonic-gate 22030Sstevel@tonic-gate /* 22040Sstevel@tonic-gate * Change the group state to PG_FAILED if it's not already. 
22050Sstevel@tonic-gate */ 22060Sstevel@tonic-gate if (!GROUP_FAILED(pg)) 22070Sstevel@tonic-gate phyint_group_chstate(pg, PG_FAILED); 22080Sstevel@tonic-gate 22090Sstevel@tonic-gate return (GROUP_FAILURE); 22100Sstevel@tonic-gate } 22110Sstevel@tonic-gate 22120Sstevel@tonic-gate /* 22130Sstevel@tonic-gate * Return the information associated with consecutive probe successes 22140Sstevel@tonic-gate * starting with the most recent probe. At most the last 2 probes can be 22150Sstevel@tonic-gate * in the unacknowledged state. All previous probes have either failed 22160Sstevel@tonic-gate * or succeeded. 22170Sstevel@tonic-gate */ 22180Sstevel@tonic-gate static void 22190Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 22200Sstevel@tonic-gate struct probe_success_count *psinfo) 22210Sstevel@tonic-gate { 22220Sstevel@tonic-gate uint_t i; 22230Sstevel@tonic-gate struct probe_stats *pr_statp; 22240Sstevel@tonic-gate uint_t most_recent; 22250Sstevel@tonic-gate uint_t second_most_recent; 22260Sstevel@tonic-gate boolean_t pi_found_failure = _B_FALSE; 22270Sstevel@tonic-gate boolean_t tg_found_failure = _B_FALSE; 22280Sstevel@tonic-gate uint_t now; 22290Sstevel@tonic-gate uint_t timeout; 22300Sstevel@tonic-gate struct target *tg; 22310Sstevel@tonic-gate 22320Sstevel@tonic-gate if (debug & D_FAILOVER) 22330Sstevel@tonic-gate logdebug("probe_success_info(%s)\n", pii->pii_name); 22340Sstevel@tonic-gate 22350Sstevel@tonic-gate bzero(psinfo, sizeof (*psinfo)); 22360Sstevel@tonic-gate now = getcurrenttime(); 22370Sstevel@tonic-gate 22380Sstevel@tonic-gate /* 22390Sstevel@tonic-gate * Start with the most recent probe, and count the number 22400Sstevel@tonic-gate * of consecutive probe successes. Latch the number of successes 22410Sstevel@tonic-gate * on hitting a failure. 
22420Sstevel@tonic-gate */ 22430Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 22440Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 22450Sstevel@tonic-gate 22460Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 22470Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 22480Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 22490Sstevel@tonic-gate 22500Sstevel@tonic-gate switch (pr_statp->pr_status) { 22510Sstevel@tonic-gate case PR_UNACKED: 22520Sstevel@tonic-gate /* 22530Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 22540Sstevel@tonic-gate */ 22550Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 22560Sstevel@tonic-gate 22570Sstevel@tonic-gate tg = pr_statp->pr_target; 22580Sstevel@tonic-gate assert(tg != NULL); 22590Sstevel@tonic-gate /* 22600Sstevel@tonic-gate * The crtt could be zero for some reason, 22610Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 22620Sstevel@tonic-gate * not available use the value of the group's probe 22630Sstevel@tonic-gate * interval which is a worst case estimate. 22640Sstevel@tonic-gate */ 22650Sstevel@tonic-gate if (tg->tg_crtt != 0) { 22660Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 22670Sstevel@tonic-gate } else { 22680Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 22690Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 22700Sstevel@tonic-gate } 22710Sstevel@tonic-gate 22720Sstevel@tonic-gate if (TIME_LT(timeout, now)) { 22730Sstevel@tonic-gate /* 22740Sstevel@tonic-gate * We hit a failure. Latch the total number of 22750Sstevel@tonic-gate * recent consecutive successes. 
22760Sstevel@tonic-gate */ 22770Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 22780Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 22790Sstevel@tonic-gate pi_found_failure = _B_TRUE; 22800Sstevel@tonic-gate if (cur_tg != NULL && tg == cur_tg) { 22810Sstevel@tonic-gate /* 22820Sstevel@tonic-gate * We hit a failure for the desired 22830Sstevel@tonic-gate * target. Latch the number of recent 22840Sstevel@tonic-gate * consecutive successes for this target 22850Sstevel@tonic-gate */ 22860Sstevel@tonic-gate tg_found_failure = _B_TRUE; 22870Sstevel@tonic-gate } 22880Sstevel@tonic-gate } 22890Sstevel@tonic-gate break; 22900Sstevel@tonic-gate 22910Sstevel@tonic-gate case PR_ACKED: 22920Sstevel@tonic-gate /* 22930Sstevel@tonic-gate * Bump up the count of probe successes, if we 22940Sstevel@tonic-gate * have not seen any failure so far. 22950Sstevel@tonic-gate */ 22960Sstevel@tonic-gate if (!pi_found_failure) 22970Sstevel@tonic-gate psinfo->ps_nsucc++; 22980Sstevel@tonic-gate 22990Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 23000Sstevel@tonic-gate !tg_found_failure) { 23010Sstevel@tonic-gate psinfo->ps_nsucc_tg++; 23020Sstevel@tonic-gate } 23030Sstevel@tonic-gate 23040Sstevel@tonic-gate /* 23050Sstevel@tonic-gate * Record the time of last success, if this is 23060Sstevel@tonic-gate * the most recent probe success. 23070Sstevel@tonic-gate */ 23080Sstevel@tonic-gate if (!psinfo->ps_tls_valid) { 23090Sstevel@tonic-gate psinfo->ps_tls = pr_statp->pr_time_acked; 23100Sstevel@tonic-gate psinfo->ps_tls_valid = _B_TRUE; 23110Sstevel@tonic-gate } 23120Sstevel@tonic-gate break; 23130Sstevel@tonic-gate 23140Sstevel@tonic-gate case PR_LOST: 23150Sstevel@tonic-gate /* 23160Sstevel@tonic-gate * We hit a failure. Latch the total number of 23170Sstevel@tonic-gate * recent consecutive successes. 
23180Sstevel@tonic-gate */ 23190Sstevel@tonic-gate pi_found_failure = _B_TRUE; 23200Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 23210Sstevel@tonic-gate /* 23220Sstevel@tonic-gate * We hit a failure for the desired target. 23230Sstevel@tonic-gate * Latch the number of recent consecutive 23240Sstevel@tonic-gate * successes for this target 23250Sstevel@tonic-gate */ 23260Sstevel@tonic-gate tg_found_failure = _B_TRUE; 23270Sstevel@tonic-gate } 23280Sstevel@tonic-gate break; 23290Sstevel@tonic-gate 23300Sstevel@tonic-gate default: 23310Sstevel@tonic-gate return; 23320Sstevel@tonic-gate 23330Sstevel@tonic-gate } 23340Sstevel@tonic-gate } 23350Sstevel@tonic-gate } 23360Sstevel@tonic-gate 23370Sstevel@tonic-gate /* 23380Sstevel@tonic-gate * Return the information associated with consecutive probe failures 23390Sstevel@tonic-gate * starting with the most recent probe. Only the last 2 probes can be in the 23400Sstevel@tonic-gate * unacknowledged state. All previous probes have either failed or succeeded. 
23410Sstevel@tonic-gate */ 23420Sstevel@tonic-gate static void 23430Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 23440Sstevel@tonic-gate struct probe_fail_count *pfinfo) 23450Sstevel@tonic-gate { 23460Sstevel@tonic-gate int i; 23470Sstevel@tonic-gate struct probe_stats *pr_statp; 23480Sstevel@tonic-gate boolean_t tg_found_success = _B_FALSE; 23490Sstevel@tonic-gate boolean_t pi_found_success = _B_FALSE; 23500Sstevel@tonic-gate int most_recent; 23510Sstevel@tonic-gate int second_most_recent; 23520Sstevel@tonic-gate uint_t now; 23530Sstevel@tonic-gate uint_t timeout; 23540Sstevel@tonic-gate struct target *tg; 23550Sstevel@tonic-gate 23560Sstevel@tonic-gate if (debug & D_FAILOVER) 23570Sstevel@tonic-gate logdebug("probe_fail_info(%s)\n", pii->pii_name); 23580Sstevel@tonic-gate 23590Sstevel@tonic-gate bzero(pfinfo, sizeof (*pfinfo)); 23600Sstevel@tonic-gate now = getcurrenttime(); 23610Sstevel@tonic-gate 23620Sstevel@tonic-gate /* 23630Sstevel@tonic-gate * Start with the most recent probe, and count the number 23640Sstevel@tonic-gate * of consecutive probe failures. Latch the number of failures 23650Sstevel@tonic-gate * on hitting a probe success. 
23660Sstevel@tonic-gate */ 23670Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 23680Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 23690Sstevel@tonic-gate 23700Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 23710Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 23720Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 23730Sstevel@tonic-gate 23740Sstevel@tonic-gate assert(PR_STATUS_VALID(pr_statp->pr_status)); 23750Sstevel@tonic-gate 23760Sstevel@tonic-gate switch (pr_statp->pr_status) { 23770Sstevel@tonic-gate case PR_UNACKED: 23780Sstevel@tonic-gate /* 23790Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 23800Sstevel@tonic-gate */ 23810Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 23820Sstevel@tonic-gate 23830Sstevel@tonic-gate tg = pr_statp->pr_target; 23840Sstevel@tonic-gate /* 23850Sstevel@tonic-gate * Target is guaranteed to exist in the unack. state 23860Sstevel@tonic-gate */ 23870Sstevel@tonic-gate assert(tg != NULL); 23880Sstevel@tonic-gate /* 23890Sstevel@tonic-gate * The crtt could be zero for some reason, 23900Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 23910Sstevel@tonic-gate * not available use the group's probe interval, 23920Sstevel@tonic-gate * which is a worst case estimate. 
23930Sstevel@tonic-gate */ 23940Sstevel@tonic-gate if (tg->tg_crtt != 0) { 23950Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 23960Sstevel@tonic-gate } else { 23970Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 23980Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 23990Sstevel@tonic-gate } 24000Sstevel@tonic-gate 24010Sstevel@tonic-gate if (TIME_GT(timeout, now)) 24020Sstevel@tonic-gate break; 24030Sstevel@tonic-gate 24040Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 24050Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 24060Sstevel@tonic-gate /* FALLTHRU */ 24070Sstevel@tonic-gate 24080Sstevel@tonic-gate case PR_LOST: 24090Sstevel@tonic-gate if (!pi_found_success) { 24100Sstevel@tonic-gate pfinfo->pf_nfail++; 24110Sstevel@tonic-gate pfinfo->pf_tff = pr_statp->pr_time_lost; 24120Sstevel@tonic-gate } 24130Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 24140Sstevel@tonic-gate !tg_found_success) { 24150Sstevel@tonic-gate pfinfo->pf_nfail_tg++; 24160Sstevel@tonic-gate } 24170Sstevel@tonic-gate break; 24180Sstevel@tonic-gate 24190Sstevel@tonic-gate default: 24200Sstevel@tonic-gate /* 24210Sstevel@tonic-gate * We hit a success or unused slot. Latch the 24220Sstevel@tonic-gate * total number of recent consecutive failures. 24230Sstevel@tonic-gate */ 24240Sstevel@tonic-gate pi_found_success = _B_TRUE; 24250Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 24260Sstevel@tonic-gate /* 24270Sstevel@tonic-gate * We hit a success for the desired target. 24280Sstevel@tonic-gate * Latch the number of recent consecutive 24290Sstevel@tonic-gate * failures for this target 24300Sstevel@tonic-gate */ 24310Sstevel@tonic-gate tg_found_success = _B_TRUE; 24320Sstevel@tonic-gate } 24330Sstevel@tonic-gate } 24340Sstevel@tonic-gate } 24350Sstevel@tonic-gate } 24360Sstevel@tonic-gate 24370Sstevel@tonic-gate /* 24380Sstevel@tonic-gate * Check if the phyint has been repaired. 
If no test address has been 24390Sstevel@tonic-gate * configured, then consider the interface repaired if the link is up (unless 24400Sstevel@tonic-gate * the link is flapping; see below). Otherwise, look for proof of probes 24410Sstevel@tonic-gate * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 24420Sstevel@tonic-gate * either IPv4 or IPv6 instance, the phyint can be considered repaired. 24430Sstevel@tonic-gate */ 24440Sstevel@tonic-gate static boolean_t 24450Sstevel@tonic-gate phyint_repaired(struct phyint *pi) 24460Sstevel@tonic-gate { 24470Sstevel@tonic-gate struct probe_success_count psinfo; 24480Sstevel@tonic-gate struct phyint_instance *pii; 24490Sstevel@tonic-gate struct target *cur_tg; 24500Sstevel@tonic-gate int pr_ndx; 24510Sstevel@tonic-gate uint_t cur_time; 24520Sstevel@tonic-gate 24530Sstevel@tonic-gate if (debug & D_FAILOVER) 24540Sstevel@tonic-gate logdebug("phyint_repaired(%s)\n", pi->pi_name); 24550Sstevel@tonic-gate 24560Sstevel@tonic-gate if (LINK_DOWN(pi)) 24570Sstevel@tonic-gate return (_B_FALSE); 24580Sstevel@tonic-gate 24590Sstevel@tonic-gate /* 24600Sstevel@tonic-gate * If we don't have any test addresses and the link is up, then 24610Sstevel@tonic-gate * consider the interface repaired, unless we've received more than 24620Sstevel@tonic-gate * LINK_UP_PERMIN link up notifications in the last minute, in 24630Sstevel@tonic-gate * which case we keep the link down until we drop back below 24640Sstevel@tonic-gate * the threshold. 
24650Sstevel@tonic-gate */ 24660Sstevel@tonic-gate if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 24670Sstevel@tonic-gate cur_time = getcurrenttime(); 24680Sstevel@tonic-gate if ((pi->pi_whenup[pi->pi_whendx] == 0 || 24690Sstevel@tonic-gate (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 24700Sstevel@tonic-gate pi->pi_lfmsg_printed = 0; 24710Sstevel@tonic-gate return (_B_TRUE); 24720Sstevel@tonic-gate } 24730Sstevel@tonic-gate if (!pi->pi_lfmsg_printed) { 24740Sstevel@tonic-gate logerr("The link has come up on %s more than %d times " 24750Sstevel@tonic-gate "in the last minute; disabling failback until it " 24760Sstevel@tonic-gate "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 24770Sstevel@tonic-gate pi->pi_lfmsg_printed = 1; 24780Sstevel@tonic-gate } 24790Sstevel@tonic-gate 24800Sstevel@tonic-gate return (_B_FALSE); 24810Sstevel@tonic-gate } 24820Sstevel@tonic-gate 24830Sstevel@tonic-gate pii = pi->pi_v4; 24840Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 24850Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 24860Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 24870Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 24880Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 24890Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 24900Sstevel@tonic-gate return (_B_TRUE); 24910Sstevel@tonic-gate } 24920Sstevel@tonic-gate 24930Sstevel@tonic-gate pii = pi->pi_v6; 24940Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 24950Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 24960Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 24970Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 24980Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 24990Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 25000Sstevel@tonic-gate return (_B_TRUE); 25010Sstevel@tonic-gate } 25020Sstevel@tonic-gate 25030Sstevel@tonic-gate return (_B_FALSE); 
25040Sstevel@tonic-gate } 25050Sstevel@tonic-gate 25060Sstevel@tonic-gate /* 25070Sstevel@tonic-gate * Try failover from phyint 'pi' to a suitable destination. 25080Sstevel@tonic-gate */ 25090Sstevel@tonic-gate int 25100Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type) 25110Sstevel@tonic-gate { 25120Sstevel@tonic-gate struct phyint *dst; 25130Sstevel@tonic-gate int err; 25140Sstevel@tonic-gate 25150Sstevel@tonic-gate if (debug & D_FAILOVER) 25160Sstevel@tonic-gate logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 25170Sstevel@tonic-gate 25180Sstevel@tonic-gate /* 25190Sstevel@tonic-gate * Attempt to find a failover destination 'dst'. 25200Sstevel@tonic-gate * dst will be null if any of the following is true 25210Sstevel@tonic-gate * Phyint is not part of a group OR 25220Sstevel@tonic-gate * Phyint is the only member of a group OR 25230Sstevel@tonic-gate * No suitable failover dst was available 25240Sstevel@tonic-gate */ 25250Sstevel@tonic-gate dst = get_failover_dst(pi, failover_type); 25260Sstevel@tonic-gate if (dst == NULL) 25270Sstevel@tonic-gate return (IPMP_EMINRED); 25280Sstevel@tonic-gate 25290Sstevel@tonic-gate dst->pi_empty = 0; /* Per state diagram */ 25300Sstevel@tonic-gate pi->pi_full = 0; /* Per state diagram */ 25310Sstevel@tonic-gate 25320Sstevel@tonic-gate err = failover(pi, dst); 25330Sstevel@tonic-gate 25340Sstevel@tonic-gate if (debug & D_FAILOVER) { 25350Sstevel@tonic-gate logdebug("failed over from %s to %s ret %d\n", 25360Sstevel@tonic-gate pi->pi_name, dst->pi_name, err); 25370Sstevel@tonic-gate } 25380Sstevel@tonic-gate if (err == 0) { 25390Sstevel@tonic-gate pi->pi_empty = 1; /* Per state diagram */ 25400Sstevel@tonic-gate /* 25410Sstevel@tonic-gate * we don't want to print out this message if a 25420Sstevel@tonic-gate * phyint is leaving the group, nor for failover from 25430Sstevel@tonic-gate * standby 25440Sstevel@tonic-gate */ 25450Sstevel@tonic-gate if (failover_type == FAILOVER_NORMAL) { 
25460Sstevel@tonic-gate logerr("Successfully failed over from NIC %s to NIC " 25470Sstevel@tonic-gate "%s\n", pi->pi_name, dst->pi_name); 25480Sstevel@tonic-gate } 25490Sstevel@tonic-gate return (0); 25500Sstevel@tonic-gate } else { 25510Sstevel@tonic-gate /* 25520Sstevel@tonic-gate * The failover did not succeed. We must retry the failover 25530Sstevel@tonic-gate * only after resyncing our state based on the kernel's. 25540Sstevel@tonic-gate * For eg. either the src or the dst might have been unplumbed 25550Sstevel@tonic-gate * causing this failure. initifs() will be called again, 25560Sstevel@tonic-gate * from main, since full_scan_required has been set to true 25570Sstevel@tonic-gate * by failover(); 25580Sstevel@tonic-gate */ 25590Sstevel@tonic-gate return (IPMP_FAILURE); 25600Sstevel@tonic-gate } 25610Sstevel@tonic-gate } 25620Sstevel@tonic-gate 25630Sstevel@tonic-gate /* 25640Sstevel@tonic-gate * global_errno captures the errno value, if failover() or failback() 25650Sstevel@tonic-gate * fails. This is sent to if_mpadm(1M). 25660Sstevel@tonic-gate */ 25670Sstevel@tonic-gate int global_errno; 25680Sstevel@tonic-gate 25690Sstevel@tonic-gate /* 25700Sstevel@tonic-gate * Attempt failover from phyint 'from' to phyint 'to'. 25710Sstevel@tonic-gate * IP moves everything from phyint 'from' to phyint 'to'. 25720Sstevel@tonic-gate */ 25730Sstevel@tonic-gate static int 25740Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to) 25750Sstevel@tonic-gate { 25760Sstevel@tonic-gate struct lifreq lifr; 25770Sstevel@tonic-gate int ret; 25780Sstevel@tonic-gate 25790Sstevel@tonic-gate if (debug & D_FAILOVER) { 25800Sstevel@tonic-gate logdebug("failing over from %s to %s\n", 25810Sstevel@tonic-gate from->pi_name, to->pi_name); 25820Sstevel@tonic-gate } 25830Sstevel@tonic-gate 25840Sstevel@tonic-gate /* 25850Sstevel@tonic-gate * Perform the failover. Both IPv4 and IPv6 are failed over 25860Sstevel@tonic-gate * using a single ioctl by passing in AF_UNSPEC family. 
25870Sstevel@tonic-gate */ 25880Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 25890Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 25900Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 25910Sstevel@tonic-gate 25920Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 25930Sstevel@tonic-gate if (ret < 0) { 25940Sstevel@tonic-gate global_errno = errno; 25950Sstevel@tonic-gate logperror("failover: ioctl (failover)"); 25960Sstevel@tonic-gate } 25970Sstevel@tonic-gate 25980Sstevel@tonic-gate /* 25990Sstevel@tonic-gate * Set full_scan_required to true. This will make us read 26000Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 26010Sstevel@tonic-gate * to reflect the current state after the failover. If the 26020Sstevel@tonic-gate * failover has failed it will then reissue the failover. 26030Sstevel@tonic-gate */ 26040Sstevel@tonic-gate full_scan_required = _B_TRUE; 26050Sstevel@tonic-gate return (ret); 26060Sstevel@tonic-gate } 26070Sstevel@tonic-gate 26080Sstevel@tonic-gate /* 26090Sstevel@tonic-gate * phyint 'pi' has recovered. Attempt failback from every phyint in the same 26100Sstevel@tonic-gate * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 26110Sstevel@tonic-gate * Return values: 26120Sstevel@tonic-gate * IPMP_SUCCESS: Failback successful from each of the other 26130Sstevel@tonic-gate * phyints in the group. 26140Sstevel@tonic-gate * IPMP_EFBPARTIAL: Failback successful from some of the other 26150Sstevel@tonic-gate * phyints in the group. 26160Sstevel@tonic-gate * IPMP_FAILURE: Failback syscall failed with some error. 26170Sstevel@tonic-gate * 26180Sstevel@tonic-gate * Note that failback is attempted regardless of the setting of the 26190Sstevel@tonic-gate * failback_enabled flag. 
26200Sstevel@tonic-gate */ 26210Sstevel@tonic-gate int 2622*2496Smeem do_failback(struct phyint *pi) 26230Sstevel@tonic-gate { 26240Sstevel@tonic-gate struct phyint *from; 26250Sstevel@tonic-gate boolean_t done; 26260Sstevel@tonic-gate boolean_t partial; 26270Sstevel@tonic-gate boolean_t attempted_failback = _B_FALSE; 26280Sstevel@tonic-gate 26290Sstevel@tonic-gate if (debug & D_FAILOVER) 26300Sstevel@tonic-gate logdebug("do_failback(%s)\n", pi->pi_name); 26310Sstevel@tonic-gate 26320Sstevel@tonic-gate /* If this phyint is not part of a named group, return. */ 26330Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 26340Sstevel@tonic-gate pi->pi_full = 1; 26350Sstevel@tonic-gate return (IPMP_SUCCESS); 26360Sstevel@tonic-gate } 26370Sstevel@tonic-gate 26380Sstevel@tonic-gate /* 26390Sstevel@tonic-gate * Attempt failback from every phyint in the group to 'pi'. 26400Sstevel@tonic-gate * The reason for doing this, instead of only from the 26410Sstevel@tonic-gate * phyint to which we did the failover is given below. 26420Sstevel@tonic-gate * 26430Sstevel@tonic-gate * After 'pi' failed, if any app. tries to join on a multicast 26440Sstevel@tonic-gate * address (IPv6), on the failed phyint, IP picks any arbitrary 26450Sstevel@tonic-gate * non-failed phyint in the group, instead of the failed phyint, 26460Sstevel@tonic-gate * in.mpathd is not aware of this. Thus failing back only from the 26470Sstevel@tonic-gate * interface to which 'pi' failed over, will failback the ipif's 26480Sstevel@tonic-gate * but not the ilm's. 
So we need to failback from all members of 26490Sstevel@tonic-gate * the phyint group 26500Sstevel@tonic-gate */ 26510Sstevel@tonic-gate done = _B_TRUE; 26520Sstevel@tonic-gate partial = _B_FALSE; 26530Sstevel@tonic-gate for (from = pi->pi_group->pg_phyint; from != NULL; 26540Sstevel@tonic-gate from = from->pi_pgnext) { 26550Sstevel@tonic-gate /* Exclude ourself as a failback src */ 26560Sstevel@tonic-gate if (from == pi) 26570Sstevel@tonic-gate continue; 26580Sstevel@tonic-gate 26590Sstevel@tonic-gate /* 26600Sstevel@tonic-gate * If the 'from' phyint has IPv4 plumbed, the 'to' 26610Sstevel@tonic-gate * phyint must also have IPv4 plumbed. Similar check 26620Sstevel@tonic-gate * for IPv6. IP makes the same check. Otherwise the 26630Sstevel@tonic-gate * failback will fail. 26640Sstevel@tonic-gate */ 26650Sstevel@tonic-gate if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 26660Sstevel@tonic-gate (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 26670Sstevel@tonic-gate partial = _B_TRUE; 26680Sstevel@tonic-gate continue; 26690Sstevel@tonic-gate } 26700Sstevel@tonic-gate 2671*2496Smeem pi->pi_empty = 0; /* Per state diagram */ 2672*2496Smeem attempted_failback = _B_TRUE; 2673*2496Smeem if (failback(from, pi) != 0) { 2674*2496Smeem done = _B_FALSE; 2675*2496Smeem break; 26760Sstevel@tonic-gate } 26770Sstevel@tonic-gate } 26780Sstevel@tonic-gate 26790Sstevel@tonic-gate /* 26800Sstevel@tonic-gate * We are done. No more phyint from which we can src the failback 26810Sstevel@tonic-gate */ 26820Sstevel@tonic-gate if (done) { 26830Sstevel@tonic-gate if (!partial) 26840Sstevel@tonic-gate pi->pi_full = 1; /* Per state diagram */ 26850Sstevel@tonic-gate /* 26860Sstevel@tonic-gate * Don't print out a message unless there is a 26870Sstevel@tonic-gate * transition from FAILED to RUNNING. For eg. 
26880Sstevel@tonic-gate * we don't want to print out this message if a 26890Sstevel@tonic-gate * phyint is leaving the group, or at startup 26900Sstevel@tonic-gate */ 26910Sstevel@tonic-gate if (attempted_failback && (pi->pi_flags & 26920Sstevel@tonic-gate (IFF_FAILED | IFF_OFFLINE))) { 26930Sstevel@tonic-gate logerr("Successfully failed back to NIC %s\n", 26940Sstevel@tonic-gate pi->pi_name); 26950Sstevel@tonic-gate } 26960Sstevel@tonic-gate return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 26970Sstevel@tonic-gate } 26980Sstevel@tonic-gate 26990Sstevel@tonic-gate return (IPMP_FAILURE); 27000Sstevel@tonic-gate } 27010Sstevel@tonic-gate 27020Sstevel@tonic-gate /* 27030Sstevel@tonic-gate * This function is similar to do_failback() above, but respects the 27040Sstevel@tonic-gate * failback_enabled flag for phyints in named groups. 27050Sstevel@tonic-gate */ 27060Sstevel@tonic-gate int 2707*2496Smeem try_failback(struct phyint *pi) 27080Sstevel@tonic-gate { 27090Sstevel@tonic-gate if (debug & D_FAILOVER) 27100Sstevel@tonic-gate logdebug("try_failback(%s)\n", pi->pi_name); 27110Sstevel@tonic-gate 27120Sstevel@tonic-gate if (pi->pi_group != phyint_anongroup && !failback_enabled) 27130Sstevel@tonic-gate return (IPMP_EFBDISABLED); 27140Sstevel@tonic-gate 2715*2496Smeem return (do_failback(pi)); 27160Sstevel@tonic-gate } 27170Sstevel@tonic-gate 27180Sstevel@tonic-gate /* 27190Sstevel@tonic-gate * Failback everything from phyint 'from' that has the same ifindex 27200Sstevel@tonic-gate * as phyint to's ifindex. 
27210Sstevel@tonic-gate */ 27220Sstevel@tonic-gate static int 27230Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to) 27240Sstevel@tonic-gate { 27250Sstevel@tonic-gate struct lifreq lifr; 27260Sstevel@tonic-gate int ret; 27270Sstevel@tonic-gate 27280Sstevel@tonic-gate if (debug & D_FAILOVER) 27290Sstevel@tonic-gate logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 27300Sstevel@tonic-gate 27310Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 27320Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 27330Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 27340Sstevel@tonic-gate 27350Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 27360Sstevel@tonic-gate if (ret < 0) { 27370Sstevel@tonic-gate global_errno = errno; 27380Sstevel@tonic-gate logperror("failback: ioctl (failback)"); 27390Sstevel@tonic-gate } 27400Sstevel@tonic-gate 27410Sstevel@tonic-gate /* 27420Sstevel@tonic-gate * Set full_scan_required to true. This will make us read 27430Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 27440Sstevel@tonic-gate * to reflect the current state after the failback. If the 27450Sstevel@tonic-gate * failback has failed it will then reissue the failback. 27460Sstevel@tonic-gate */ 27470Sstevel@tonic-gate full_scan_required = _B_TRUE; 27480Sstevel@tonic-gate 27490Sstevel@tonic-gate return (ret); 27500Sstevel@tonic-gate } 27510Sstevel@tonic-gate 27520Sstevel@tonic-gate /* 27530Sstevel@tonic-gate * Select a target phyint for failing over from 'pi'. 27540Sstevel@tonic-gate * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred 27550Sstevel@tonic-gate * target phyint is chosen as follows, 27560Sstevel@tonic-gate * 1. Pick any inactive standby interface. 27570Sstevel@tonic-gate * 2. 
If no inactive standby is available, select any phyint in the 27580Sstevel@tonic-gate * same group that has the least number of logints, (excluding 27590Sstevel@tonic-gate * IFF_NOFAILOVER and !IFF_UP logints) 27600Sstevel@tonic-gate * If we are failing over from a standby, failover_type is 27610Sstevel@tonic-gate * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 27620Sstevel@tonic-gate * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 27630Sstevel@tonic-gate * and we won't return NULL, as long as there is at least 1 other phyint 27640Sstevel@tonic-gate * in the group. 27650Sstevel@tonic-gate */ 27660Sstevel@tonic-gate static struct phyint * 27670Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type) 27680Sstevel@tonic-gate { 27690Sstevel@tonic-gate struct phyint *maybe = NULL; 27700Sstevel@tonic-gate struct phyint *pi2; 27710Sstevel@tonic-gate struct phyint *last_choice = NULL; 27720Sstevel@tonic-gate 27730Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) 27740Sstevel@tonic-gate return (NULL); 27750Sstevel@tonic-gate 27760Sstevel@tonic-gate /* 27770Sstevel@tonic-gate * Loop thru the phyints in the group, and pick the preferred 27780Sstevel@tonic-gate * phyint for the target. 27790Sstevel@tonic-gate */ 27800Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 27810Sstevel@tonic-gate /* Exclude ourself and offlined interfaces */ 27820Sstevel@tonic-gate if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 27830Sstevel@tonic-gate continue; 27840Sstevel@tonic-gate 27850Sstevel@tonic-gate /* 27860Sstevel@tonic-gate * The chosen target phyint must have IPv4 instance 27870Sstevel@tonic-gate * plumbed, if the src phyint has IPv4 plumbed. Similarly 27880Sstevel@tonic-gate * for IPv6. 
27890Sstevel@tonic-gate */ 27900Sstevel@tonic-gate if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 27910Sstevel@tonic-gate (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 27920Sstevel@tonic-gate continue; 27930Sstevel@tonic-gate 27940Sstevel@tonic-gate /* The chosen target must be PI_RUNNING. */ 27950Sstevel@tonic-gate if (pi2->pi_state != PI_RUNNING) { 27960Sstevel@tonic-gate last_choice = pi2; 27970Sstevel@tonic-gate continue; 27980Sstevel@tonic-gate } 27990Sstevel@tonic-gate 2800704Sethindra if ((pi2->pi_flags & (IFF_STANDBY | IFF_INACTIVE)) && 28010Sstevel@tonic-gate (failover_type != FAILOVER_TO_NONSTANDBY)) { 28020Sstevel@tonic-gate return (pi2); 28030Sstevel@tonic-gate } else { 28040Sstevel@tonic-gate if (maybe == NULL) 28050Sstevel@tonic-gate maybe = pi2; 28060Sstevel@tonic-gate else if (logint_upcount(pi2) < logint_upcount(maybe)) 28070Sstevel@tonic-gate maybe = pi2; 28080Sstevel@tonic-gate } 28090Sstevel@tonic-gate } 28100Sstevel@tonic-gate if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 28110Sstevel@tonic-gate return (last_choice); 28120Sstevel@tonic-gate else 28130Sstevel@tonic-gate return (maybe); 28140Sstevel@tonic-gate } 28150Sstevel@tonic-gate 28160Sstevel@tonic-gate /* 28170Sstevel@tonic-gate * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 
28180Sstevel@tonic-gate */ 28190Sstevel@tonic-gate boolean_t 28200Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 28210Sstevel@tonic-gate { 28220Sstevel@tonic-gate int ifsock; 28230Sstevel@tonic-gate struct lifreq lifr; 28240Sstevel@tonic-gate 28250Sstevel@tonic-gate if (debug & D_FAILOVER) { 28260Sstevel@tonic-gate logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 28270Sstevel@tonic-gate pi->pi_name, flags, (int)setfl); 28280Sstevel@tonic-gate } 28290Sstevel@tonic-gate 28300Sstevel@tonic-gate if (pi->pi_v4 != NULL) { 28310Sstevel@tonic-gate ifsock = ifsock_v4; 28320Sstevel@tonic-gate } else { 28330Sstevel@tonic-gate ifsock = ifsock_v6; 28340Sstevel@tonic-gate } 28350Sstevel@tonic-gate 28360Sstevel@tonic-gate /* 28370Sstevel@tonic-gate * Get the current flags from the kernel, and set/clear the 28380Sstevel@tonic-gate * desired phyint flags. Since we set only phyint flags, we can 28390Sstevel@tonic-gate * do it on either IPv4 or IPv6 instance. 28400Sstevel@tonic-gate */ 28410Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 28420Sstevel@tonic-gate lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 28430Sstevel@tonic-gate if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 28440Sstevel@tonic-gate if (errno != ENXIO) 28450Sstevel@tonic-gate logperror("change_lif_flags: ioctl (get flags)"); 28460Sstevel@tonic-gate return (_B_FALSE); 28470Sstevel@tonic-gate } 28480Sstevel@tonic-gate if (setfl) 28490Sstevel@tonic-gate lifr.lifr_flags |= flags; 28500Sstevel@tonic-gate else 28510Sstevel@tonic-gate lifr.lifr_flags &= ~flags; 28520Sstevel@tonic-gate if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 28530Sstevel@tonic-gate if (errno != ENXIO) 28540Sstevel@tonic-gate logperror("change_lif_flags: ioctl (set flags)"); 28550Sstevel@tonic-gate return (_B_FALSE); 28560Sstevel@tonic-gate } 28570Sstevel@tonic-gate 28580Sstevel@tonic-gate /* 28590Sstevel@tonic-gate * Keep pi_flags in synch. 
with actual flags. Assumes flags are 28600Sstevel@tonic-gate * phyint flags. 28610Sstevel@tonic-gate */ 28620Sstevel@tonic-gate if (setfl) 28630Sstevel@tonic-gate pi->pi_flags |= flags; 28640Sstevel@tonic-gate else 28650Sstevel@tonic-gate pi->pi_flags &= ~flags; 28660Sstevel@tonic-gate 28670Sstevel@tonic-gate if (pi->pi_v4) 28680Sstevel@tonic-gate pi->pi_v4->pii_flags = pi->pi_flags; 28690Sstevel@tonic-gate 28700Sstevel@tonic-gate if (pi->pi_v6) 28710Sstevel@tonic-gate pi->pi_v6->pii_flags = pi->pi_flags; 28720Sstevel@tonic-gate 28730Sstevel@tonic-gate return (_B_TRUE); 28740Sstevel@tonic-gate } 28750Sstevel@tonic-gate 28760Sstevel@tonic-gate /* 28770Sstevel@tonic-gate * icmp cksum computation for IPv4. 28780Sstevel@tonic-gate */ 28790Sstevel@tonic-gate static int 28800Sstevel@tonic-gate in_cksum(ushort_t *addr, int len) 28810Sstevel@tonic-gate { 28820Sstevel@tonic-gate register int nleft = len; 28830Sstevel@tonic-gate register ushort_t *w = addr; 28840Sstevel@tonic-gate register ushort_t answer; 28850Sstevel@tonic-gate ushort_t odd_byte = 0; 28860Sstevel@tonic-gate register int sum = 0; 28870Sstevel@tonic-gate 28880Sstevel@tonic-gate /* 28890Sstevel@tonic-gate * Our algorithm is simple, using a 32 bit accumulator (sum), 28900Sstevel@tonic-gate * we add sequential 16 bit words to it, and at the end, fold 28910Sstevel@tonic-gate * back all the carry bits from the top 16 bits into the lower 28920Sstevel@tonic-gate * 16 bits. 
28930Sstevel@tonic-gate */ 28940Sstevel@tonic-gate while (nleft > 1) { 28950Sstevel@tonic-gate sum += *w++; 28960Sstevel@tonic-gate nleft -= 2; 28970Sstevel@tonic-gate } 28980Sstevel@tonic-gate 28990Sstevel@tonic-gate /* mop up an odd byte, if necessary */ 29000Sstevel@tonic-gate if (nleft == 1) { 29010Sstevel@tonic-gate *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 29020Sstevel@tonic-gate sum += odd_byte; 29030Sstevel@tonic-gate } 29040Sstevel@tonic-gate 29050Sstevel@tonic-gate /* 29060Sstevel@tonic-gate * add back carry outs from top 16 bits to low 16 bits 29070Sstevel@tonic-gate */ 29080Sstevel@tonic-gate sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 29090Sstevel@tonic-gate sum += (sum >> 16); /* add carry */ 29100Sstevel@tonic-gate answer = ~sum; /* truncate to 16 bits */ 29110Sstevel@tonic-gate return (answer); 29120Sstevel@tonic-gate } 29130Sstevel@tonic-gate 29140Sstevel@tonic-gate static void 29150Sstevel@tonic-gate reset_snxt_basetimes(void) 29160Sstevel@tonic-gate { 29170Sstevel@tonic-gate struct phyint_instance *pii; 29180Sstevel@tonic-gate 29190Sstevel@tonic-gate for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 29200Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 29210Sstevel@tonic-gate } 29220Sstevel@tonic-gate } 29230Sstevel@tonic-gate 29240Sstevel@tonic-gate /* 29250Sstevel@tonic-gate * Is the address one of our own addresses? Unfortunately, 29260Sstevel@tonic-gate * we cannot check our phyint tables to determine if the address 29270Sstevel@tonic-gate * is our own. This is because, we don't track interfaces that 29280Sstevel@tonic-gate * are not part of any group. We have to either use a 'bind' or 29290Sstevel@tonic-gate * get the complete list of all interfaces using SIOCGLIFCONF, 29302250Srk129064 * to do this check. We could also use SIOCTMYADDR. 29312250Srk129064 * Bind fails for the local zone address, so we might include local zone 29322250Srk129064 * address as target address. 
If local zone address is a target address
 * and it is up, it is not possible to detect the interface failure.
 * SIOCTMYADDR also doesn't consider local zone address as own address.
 * So, we choose to use SIOCGLIFCONF to collect the local addresses, and they
 * are stored in laddr_list.
 *
 * Returns _B_TRUE when 'addr' equals the address of some entry on
 * laddr_list, and _B_FALSE when the list holds no such entry.
 */

boolean_t
own_address(struct in6_addr addr)
{
	struct local_addr *entry;

	entry = laddr_list;
	while (entry != NULL) {
		if (IN6_ARE_ADDR_EQUAL(&addr, &entry->addr))
			return (_B_TRUE);
		entry = entry->next;
	}

	return (_B_FALSE);
}