/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 1987 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley. The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
21*0Sstevel@tonic-gate */ 22*0Sstevel@tonic-gate 23*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 24*0Sstevel@tonic-gate 25*0Sstevel@tonic-gate #include "mpd_defs.h" 26*0Sstevel@tonic-gate #include "mpd_tables.h" 27*0Sstevel@tonic-gate 28*0Sstevel@tonic-gate /* 29*0Sstevel@tonic-gate * Probe types for probe() 30*0Sstevel@tonic-gate */ 31*0Sstevel@tonic-gate #define PROBE_UNI 0x1234 /* Unicast probe packet */ 32*0Sstevel@tonic-gate #define PROBE_MULTI 0x5678 /* Multicast probe packet */ 33*0Sstevel@tonic-gate #define PROBE_RTT 0x9abc /* RTT only probe packet */ 34*0Sstevel@tonic-gate 35*0Sstevel@tonic-gate #define MSEC_PERMIN (60 * MILLISEC) /* Number of milliseconds in a minute */ 36*0Sstevel@tonic-gate 37*0Sstevel@tonic-gate /* 38*0Sstevel@tonic-gate * Format of probe / probe response packets. This is an ICMP Echo request 39*0Sstevel@tonic-gate * or ICMP Echo reply. Packet format is same for both IPv4 and IPv6 40*0Sstevel@tonic-gate */ 41*0Sstevel@tonic-gate struct pr_icmp 42*0Sstevel@tonic-gate { 43*0Sstevel@tonic-gate uint8_t pr_icmp_type; /* type field */ 44*0Sstevel@tonic-gate uint8_t pr_icmp_code; /* code field */ 45*0Sstevel@tonic-gate uint16_t pr_icmp_cksum; /* checksum field */ 46*0Sstevel@tonic-gate uint16_t pr_icmp_id; /* Identification */ 47*0Sstevel@tonic-gate uint16_t pr_icmp_seq; /* sequence number */ 48*0Sstevel@tonic-gate uint32_t pr_icmp_timestamp; /* Time stamp */ 49*0Sstevel@tonic-gate uint32_t pr_icmp_mtype; /* Message type */ 50*0Sstevel@tonic-gate }; 51*0Sstevel@tonic-gate 52*0Sstevel@tonic-gate static struct in6_addr all_nodes_mcast_v6 = { { 0xff, 0x2, 0x0, 0x0, 53*0Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 54*0Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x0, 55*0Sstevel@tonic-gate 0x0, 0x0, 0x0, 0x1 } }; 56*0Sstevel@tonic-gate 57*0Sstevel@tonic-gate static struct in_addr all_nodes_mcast_v4 = { { { 0xe0, 0x0, 0x0, 0x1 } } }; 58*0Sstevel@tonic-gate 59*0Sstevel@tonic-gate static hrtime_t last_fdt_bumpup_time; /* When FDT was bumped up last */ 
60*0Sstevel@tonic-gate 61*0Sstevel@tonic-gate static void *find_ancillary(struct msghdr *msg, int cmsg_type); 62*0Sstevel@tonic-gate static void pi_set_crtt(struct target *tg, int m, 63*0Sstevel@tonic-gate boolean_t is_probe_uni); 64*0Sstevel@tonic-gate static void incoming_echo_reply(struct phyint_instance *pii, 65*0Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 66*0Sstevel@tonic-gate static void incoming_rtt_reply(struct phyint_instance *pii, 67*0Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 68*0Sstevel@tonic-gate static void incoming_mcast_reply(struct phyint_instance *pii, 69*0Sstevel@tonic-gate struct pr_icmp *reply, struct in6_addr fromaddr); 70*0Sstevel@tonic-gate 71*0Sstevel@tonic-gate static boolean_t check_pg_crtt_improved(struct phyint_group *pg); 72*0Sstevel@tonic-gate static boolean_t check_pii_crtt_improved(struct phyint_instance *pii); 73*0Sstevel@tonic-gate static boolean_t check_exception_target(struct phyint_instance *pii, 74*0Sstevel@tonic-gate struct target *target); 75*0Sstevel@tonic-gate static void probe_fail_info(struct phyint_instance *pii, 76*0Sstevel@tonic-gate struct target *cur_tg, struct probe_fail_count *pfinfo); 77*0Sstevel@tonic-gate static void probe_success_info(struct phyint_instance *pii, 78*0Sstevel@tonic-gate struct target *cur_tg, struct probe_success_count *psinfo); 79*0Sstevel@tonic-gate static boolean_t phyint_repaired(struct phyint *pi); 80*0Sstevel@tonic-gate 81*0Sstevel@tonic-gate static int failover(struct phyint *from, struct phyint *to); 82*0Sstevel@tonic-gate static int failback(struct phyint *from, struct phyint *to); 83*0Sstevel@tonic-gate static struct phyint *get_failover_dst(struct phyint *pi, int failover_type); 84*0Sstevel@tonic-gate 85*0Sstevel@tonic-gate static boolean_t highest_ack_tg(uint16_t seq, struct target *tg); 86*0Sstevel@tonic-gate static int in_cksum(ushort_t *addr, int len); 87*0Sstevel@tonic-gate static void reset_snxt_basetimes(void); 

/*
 * CRTT - Conservative Round Trip Time Estimate
 * Probe success - A matching probe reply received before CRTT ms has elapsed
 *	after sending the probe.
 * Probe failure - No probe reply received and more than CRTT ms has elapsed
 *	after sending the probe.
 *
 * TLS - Time last success. Most recent probe ack received at this time.
 * TFF - Time first fail. The time of the earliest probe failure in
 *	a consecutive series of probe failures.
 * NUM_PROBE_REPAIRS - Number of consecutive successful probes required
 *	before declaring phyint repair.
 * NUM_PROBE_FAILS - Number of consecutive probe failures required to
 *	declare a phyint failure.
 *
 *			Phyint state diagram
 *
 * The state of a phyint that is capable of being probed, is completely
 * specified by the 5-tuple <pi_state, pg_groupfailed, I, pi_empty, pi_full>.
 *
 * A phyint starts in either PI_RUNNING or PI_FAILED, depending on the state
 * of the link (according to the driver). If the phyint is also configured
 * with a test address (the common case) and probe targets, then a phyint must
 * also successfully be able to send and receive probes in order to remain in
 * the PI_RUNNING state (otherwise, it transitions to PI_FAILED).
 *
 * Further, if a PI_RUNNING phyint is configured with a test address but is
 * unable to find any probe targets, it will transition to the PI_NOTARGETS
 * state, which indicates that the link is apparently functional but that
 * in.mpathd is unable to send probes to verify functionality (in this case,
 * in.mpathd makes the optimistic assumption that the interface is working
 * correctly and thus does not perform a failover, but reports the interface
 * as IPMP_IF_UNKNOWN through the async events and query interfaces).
 *
 * At any point, a phyint may be administratively marked offline via if_mpadm.
 * In this case, the interface always transitions to PI_OFFLINE, regardless
 * of its previous state. When the interface is later brought back online,
 * in.mpathd acts as if the interface is new (and thus it transitions to
 * PI_RUNNING or PI_FAILED based on the status of the link and the result of
 * its probes, if probes are sent).
 *
 * pi_state - PI_RUNNING or PI_FAILED
 *	PI_RUNNING: The failure detection logic says the phyint is good.
 *	PI_FAILED: The failure detection logic says the phyint has failed.
 *
 * pg_groupfailed - Group failure, all interfaces in the group have failed.
 *	The pi_state may be either PI_FAILED or PI_NOTARGETS.
 *	In the case of router targets, we assume that the current list of
 *	targets obtained from the routing table, is still valid, so the
 *	phyint state is PI_FAILED. In the case of host targets, we delete the
 *	list of targets, and multicast to the all hosts, to reconstruct the
 *	target list. So the phyints are in the PI_NOTARGETS state.
 *
 * I - value of (pi_flags & IFF_INACTIVE)
 *	IFF_INACTIVE: No failovers have been done to the standby, from
 *	other phyints. This phyint is an inactive standby.
 *
 * pi_empty
 *	This phyint has failed over successfully to another phyint, and
 *	this phyint is currently "empty". It does not host any addresses or
 *	multicast membership etc. This is the state of a phyint after a
 *	failover from the phyint has completed successfully and no subsequent
 *	'failover to' or 'failback to' has occurred on the phyint.
 *	IP guarantees that no new logicals will be hosted nor any multicast
 *	joins permitted on the phyint, since the phyint is either failed or
 *	inactive. pi_empty being set implies the phyint is either failed or
 *	inactive.
 *
 * pi_full
 *	The phyint hosts all of its own addresses that it "owns". If the
 *	phyint was previously failed or inactive, failbacks to the phyint
 *	have completed successfully. i.e. No more failbacks to this phyint
 *	can produce any change in system state whatsoever.
 *
 * Not all 32 possible combinations of the above 5-tuple are possible.
 * Furthermore some of the above combinations are transient. They may occur
 * only because the failover or failback did not complete successfully. The
 * failover/failback will be retried and eventually a stable state will be
 * reached.
 *
 * I is tracked by IP. pi_state, pi_empty and pi_full are tracked by mpathd.
 * The following are the state machines. 'from' and 'to' are the src and
 * dst of the failover/failback, below.
 *
 *			pi_empty state machine
 * ---------------------------------------------------------------------------
 *	Event			State		   ->	New State
 * ---------------------------------------------------------------------------
 *	successful completion	from.pi_empty = 0  ->	from.pi_empty = 1
 *	of failover
 *
 *	Initiate failover	to.pi_empty = X    ->	to.pi_empty = 0
 *
 *	Initiate failback	to.pi_empty = X    ->	to.pi_empty = 0
 *
 *	group failure		pi_empty = X       ->	pi_empty = 0
 * ---------------------------------------------------------------------------
 *
 *			pi_full state machine
 * ---------------------------------------------------------------------------
 *	Event			State		   ->	New State
 * ---------------------------------------------------------------------------
 *	successful completion	to.pi_full = 0     ->	to.pi_full = 1
 *	of failback from
 *	each of the other phyints
 *
 *	Initiate failover	from.pi_full = X   ->	from.pi_full = 0
 *
 *	group failure		pi_full = X        ->	pi_full = 0
 * ---------------------------------------------------------------------------
 *
 *			pi_state state machine
 * ---------------------------------------------------------------------------
 *	Event		State			New State
 *			Action:
 * ---------------------------------------------------------------------------
 *	NIC failure	(PI_RUNNING, I == 0) -> (PI_FAILED, I == 0)
 *	detection	: set IFF_FAILED on this phyint
 *			: failover from this phyint to another
 *
 *	NIC failure	(PI_RUNNING, I == 1) -> (PI_FAILED, I == 1)
 *	detection	: set IFF_FAILED on this phyint
 *
 *	NIC repair	(PI_FAILED, I == 0) -> (PI_RUNNING, I == 0)
 *	detection	: to.pi_empty = 0
 *			: failback to this phyint if enabled
 *			: clear IFF_FAILED on this phyint
 *
 *	NIC repair	(PI_FAILED, I == 1) -> (PI_RUNNING, I == 1)
 *	detection	: clear IFF_FAILED on this phyint
 *
 *	Group failure	(perform on all phyints in the group)
 *	detection	PI_RUNNING		PI_FAILED
 *	(Router targets) : set IFF_FAILED
 *			: clear pi_empty and pi_full
 *
 *	Group failure	(perform on all phyints in the group)
 *	detection	PI_RUNNING		PI_NOTARGETS
 *	(Host targets)	: set IFF_FAILED
 *			: clear pi_empty and pi_full
 *			: delete the target list on all phyints
 * ---------------------------------------------------------------------------
 *
 *			I state machine
 * ---------------------------------------------------------------------------
 *	Event		State		Action:
 * ---------------------------------------------------------------------------
 *	Turn on I	pi_empty == 0	: failover from standby
 *
 *	Turn off I	PI_RUNNING,	: pi_empty = 0
 *			pi_full == 0	: failback to this if enabled
 * ---------------------------------------------------------------------------
 *
 * Assertions: (Read '==>' as implies)
 *
 * (pi_empty == 1) ==> (I == 1 || pi_state == PI_FAILED)
 * (pi_empty == 1) ==> (pi_full == 0)
 * (pi_full == 1) ==> (pi_empty == 0)
 *
 * Invariants
 *
 * pg_groupfailed = 0 &&
 *	1. (I == 1, pi_empty == 0) ==> initiate failover from standby
 *	2. (I == 0, PI_FAILED, pi_empty == 0) ==> initiate failover from phyint
 *	3. (I == 0, PI_RUNNING, pi_full == 0) ==> initiate failback to phyint
 *
 * 1. says that an inactive standby, that is not empty, has to be failed
 * over. For a standby to be truly inactive, it should not host any
 * addresses.
So we move them to some other phyint. Usually we catch the 258*0Sstevel@tonic-gate * turn on of IFF_INACTIVE, and perform this action. However if the failover 259*0Sstevel@tonic-gate * did not complete successfully, then subsequently we have lost the edge 260*0Sstevel@tonic-gate * trigger, and this invariant kicks in and completes the action. 261*0Sstevel@tonic-gate * 262*0Sstevel@tonic-gate * 2. says that any failed phyint that is not empty must be failed over. 263*0Sstevel@tonic-gate * Usually we do the failover when we detect NIC failure. However if the 264*0Sstevel@tonic-gate * failover does not complete successfully, this invariant kicks in and 265*0Sstevel@tonic-gate * completes the failover. We exclude inactive standby which is covered by 1. 266*0Sstevel@tonic-gate * 267*0Sstevel@tonic-gate * 3. says that any running phyint that is not full must be failed back. 268*0Sstevel@tonic-gate * Usually we do the failback when we detect NIC repair. However if the 269*0Sstevel@tonic-gate * failback does not complete successfully, this invariant kicks in and 270*0Sstevel@tonic-gate * completes the failback. Note that we don't want to failback to an inactive 271*0Sstevel@tonic-gate * standby. 272*0Sstevel@tonic-gate * 273*0Sstevel@tonic-gate * The invariants 1 - 3 and the actions are in initifs(). 274*0Sstevel@tonic-gate */ 275*0Sstevel@tonic-gate 276*0Sstevel@tonic-gate struct probes_missed probes_missed; 277*0Sstevel@tonic-gate 278*0Sstevel@tonic-gate /* 279*0Sstevel@tonic-gate * Compose and transmit an ICMP ECHO REQUEST packet. The IP header 280*0Sstevel@tonic-gate * will be added on by the kernel. The id field identifies this phyint. 281*0Sstevel@tonic-gate * and the sequence number is an increasing (modulo 2^^16) integer. The data 282*0Sstevel@tonic-gate * portion holds the time value when the packet is sent. On echo this is 283*0Sstevel@tonic-gate * extracted to compute the round-trip time. Three different types of 284*0Sstevel@tonic-gate * probe packets are used. 
 *
 * PROBE_UNI: This type is used to do failure detection / failure recovery
 *	and RTT calculation. PROBE_UNI probes are spaced apart in time,
 *	not less than the current CRTT. pii_probes[] stores data
 *	about these probes. These packets consume sequence number space.
 *
 * PROBE_RTT: This type is used to make only rtt measurements. Normally these
 *	are not used. Under heavy network load, the rtt may go up very high,
 *	due to a spike, or may appear to go high, due to extreme scheduling
 *	delays. Once the network stress is removed, mpathd takes a long time
 *	to recover, because the probe_interval is already high, and it takes
 *	a long time to send out a sufficient number of probes to bring down
 *	the rtt. To avoid this problem, PROBE_RTT probes are sent out every
 *	user_probe_interval ms, and will cause only rtt updates. These packets
 *	do not consume sequence number space nor is information about these
 *	packets stored in the pii_probes[].
 *
 * PROBE_MULTI: This type is only used to construct a list of targets, when
 *	no targets are known. The packet is multicast to the all hosts addr.
304*0Sstevel@tonic-gate */ 305*0Sstevel@tonic-gate static void 306*0Sstevel@tonic-gate probe(struct phyint_instance *pii, uint_t probe_type, uint_t cur_time) 307*0Sstevel@tonic-gate { 308*0Sstevel@tonic-gate struct pr_icmp probe_pkt; /* Probe packet */ 309*0Sstevel@tonic-gate struct sockaddr_in6 whereto6; /* target address IPv6 */ 310*0Sstevel@tonic-gate struct sockaddr_in whereto; /* target address IPv4 */ 311*0Sstevel@tonic-gate int pr_ndx; /* probe index in pii->pii_probes[] */ 312*0Sstevel@tonic-gate boolean_t sent = _B_TRUE; 313*0Sstevel@tonic-gate 314*0Sstevel@tonic-gate if (debug & D_TARGET) { 315*0Sstevel@tonic-gate logdebug("probe(%s %s %d %u)\n", AF_STR(pii->pii_af), 316*0Sstevel@tonic-gate pii->pii_name, probe_type, cur_time); 317*0Sstevel@tonic-gate } 318*0Sstevel@tonic-gate 319*0Sstevel@tonic-gate assert(pii->pii_probe_sock != -1); 320*0Sstevel@tonic-gate assert(probe_type == PROBE_UNI || probe_type == PROBE_MULTI || 321*0Sstevel@tonic-gate probe_type == PROBE_RTT); 322*0Sstevel@tonic-gate 323*0Sstevel@tonic-gate probe_pkt.pr_icmp_type = (pii->pii_af == AF_INET) ? 324*0Sstevel@tonic-gate ICMP_ECHO_REQUEST : ICMP6_ECHO_REQUEST; 325*0Sstevel@tonic-gate probe_pkt.pr_icmp_code = 0; 326*0Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 0; 327*0Sstevel@tonic-gate probe_pkt.pr_icmp_seq = htons(pii->pii_snxt); 328*0Sstevel@tonic-gate 329*0Sstevel@tonic-gate /* 330*0Sstevel@tonic-gate * Since there is no need to do arithmetic on the icmpid, 331*0Sstevel@tonic-gate * (only equality check is done) pii_icmpid is stored in 332*0Sstevel@tonic-gate * network byte order at initialization itself. 
333*0Sstevel@tonic-gate */ 334*0Sstevel@tonic-gate probe_pkt.pr_icmp_id = pii->pii_icmpid; 335*0Sstevel@tonic-gate probe_pkt.pr_icmp_timestamp = htonl(cur_time); 336*0Sstevel@tonic-gate probe_pkt.pr_icmp_mtype = htonl(probe_type); 337*0Sstevel@tonic-gate 338*0Sstevel@tonic-gate /* 339*0Sstevel@tonic-gate * If probe_type is PROBE_MULTI, this packet will be multicast to 340*0Sstevel@tonic-gate * the all hosts address. Otherwise it is unicast to the next target. 341*0Sstevel@tonic-gate */ 342*0Sstevel@tonic-gate assert(probe_type == PROBE_MULTI || ((pii->pii_target_next != NULL) && 343*0Sstevel@tonic-gate pii->pii_rtt_target_next != NULL)); 344*0Sstevel@tonic-gate 345*0Sstevel@tonic-gate if (pii->pii_af == AF_INET6) { 346*0Sstevel@tonic-gate bzero(&whereto6, sizeof (whereto6)); 347*0Sstevel@tonic-gate whereto6.sin6_family = AF_INET6; 348*0Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 349*0Sstevel@tonic-gate whereto6.sin6_addr = all_nodes_mcast_v6; 350*0Sstevel@tonic-gate } else if (probe_type == PROBE_UNI) { 351*0Sstevel@tonic-gate whereto6.sin6_addr = pii->pii_target_next->tg_address; 352*0Sstevel@tonic-gate } else { 353*0Sstevel@tonic-gate /* type is PROBE_RTT */ 354*0Sstevel@tonic-gate whereto6.sin6_addr = 355*0Sstevel@tonic-gate pii->pii_rtt_target_next->tg_address; 356*0Sstevel@tonic-gate } 357*0Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 358*0Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto6, 359*0Sstevel@tonic-gate sizeof (whereto6)) != sizeof (probe_pkt)) { 360*0Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 361*0Sstevel@tonic-gate sent = _B_FALSE; 362*0Sstevel@tonic-gate } 363*0Sstevel@tonic-gate } else { 364*0Sstevel@tonic-gate bzero(&whereto, sizeof (whereto)); 365*0Sstevel@tonic-gate whereto.sin_family = AF_INET; 366*0Sstevel@tonic-gate if (probe_type == PROBE_MULTI) { 367*0Sstevel@tonic-gate whereto.sin_addr = all_nodes_mcast_v4; 368*0Sstevel@tonic-gate } else if (probe_type == 
PROBE_UNI) { 369*0Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 370*0Sstevel@tonic-gate &pii->pii_target_next->tg_address, 371*0Sstevel@tonic-gate &whereto.sin_addr); 372*0Sstevel@tonic-gate } else { 373*0Sstevel@tonic-gate /* type is PROBE_RTT */ 374*0Sstevel@tonic-gate IN6_V4MAPPED_TO_INADDR( 375*0Sstevel@tonic-gate &pii->pii_rtt_target_next->tg_address, 376*0Sstevel@tonic-gate &whereto.sin_addr); 377*0Sstevel@tonic-gate } 378*0Sstevel@tonic-gate 379*0Sstevel@tonic-gate /* 380*0Sstevel@tonic-gate * Compute the IPv4 icmp checksum. Does not cover the IP header. 381*0Sstevel@tonic-gate */ 382*0Sstevel@tonic-gate probe_pkt.pr_icmp_cksum = 383*0Sstevel@tonic-gate in_cksum((ushort_t *)&probe_pkt, (int)sizeof (probe_pkt)); 384*0Sstevel@tonic-gate if (sendto(pii->pii_probe_sock, (char *)&probe_pkt, 385*0Sstevel@tonic-gate sizeof (probe_pkt), 0, (struct sockaddr *)&whereto, 386*0Sstevel@tonic-gate sizeof (whereto)) != sizeof (probe_pkt)) { 387*0Sstevel@tonic-gate logperror_pii(pii, "probe: probe sendto"); 388*0Sstevel@tonic-gate sent = _B_FALSE; 389*0Sstevel@tonic-gate } 390*0Sstevel@tonic-gate } 391*0Sstevel@tonic-gate 392*0Sstevel@tonic-gate /* 393*0Sstevel@tonic-gate * If this is a PROBE_UNI probe packet being unicast to a target, then 394*0Sstevel@tonic-gate * update our tables. We will need this info in processing the probe 395*0Sstevel@tonic-gate * response. PROBE_MULTI and PROBE_RTT packets are not used for 396*0Sstevel@tonic-gate * the purpose of failure or recovery detection. PROBE_MULTI packets 397*0Sstevel@tonic-gate * are only used to construct a list of targets. PROBE_RTT packets are 398*0Sstevel@tonic-gate * used only for updating the rtt and not for failure detection. 
399*0Sstevel@tonic-gate */ 400*0Sstevel@tonic-gate if (probe_type == PROBE_UNI && sent) { 401*0Sstevel@tonic-gate pr_ndx = pii->pii_probe_next; 402*0Sstevel@tonic-gate assert(pr_ndx >= 0 && pr_ndx < PROBE_STATS_COUNT); 403*0Sstevel@tonic-gate 404*0Sstevel@tonic-gate /* Collect statistics, before we reuse the last slot. */ 405*0Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_LOST) 406*0Sstevel@tonic-gate pii->pii_cum_stats.lost++; 407*0Sstevel@tonic-gate else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) 408*0Sstevel@tonic-gate pii->pii_cum_stats.acked++; 409*0Sstevel@tonic-gate pii->pii_cum_stats.sent++; 410*0Sstevel@tonic-gate 411*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_UNACKED; 412*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_target = pii->pii_target_next; 413*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_sent = cur_time; 414*0Sstevel@tonic-gate pii->pii_probe_next = PROBE_INDEX_NEXT(pii->pii_probe_next); 415*0Sstevel@tonic-gate pii->pii_target_next = target_next(pii->pii_target_next); 416*0Sstevel@tonic-gate assert(pii->pii_target_next != NULL); 417*0Sstevel@tonic-gate /* 418*0Sstevel@tonic-gate * If we have a single variable to denote the next target to 419*0Sstevel@tonic-gate * probe for both rtt probes and failure detection probes, we 420*0Sstevel@tonic-gate * could end up with a situation where the failure detection 421*0Sstevel@tonic-gate * probe targets become disjoint from the rtt probe targets. 422*0Sstevel@tonic-gate * Eg. if 2 targets and the actual fdt is double the user 423*0Sstevel@tonic-gate * specified fdt. So we have 2 variables. In this scheme 424*0Sstevel@tonic-gate * we also reset pii_rtt_target_next for every fdt probe, 425*0Sstevel@tonic-gate * though that may not be necessary. 
426*0Sstevel@tonic-gate */ 427*0Sstevel@tonic-gate pii->pii_rtt_target_next = pii->pii_target_next; 428*0Sstevel@tonic-gate pii->pii_snxt++; 429*0Sstevel@tonic-gate } else if (probe_type == PROBE_RTT) { 430*0Sstevel@tonic-gate pii->pii_rtt_target_next = 431*0Sstevel@tonic-gate target_next(pii->pii_rtt_target_next); 432*0Sstevel@tonic-gate assert(pii->pii_rtt_target_next != NULL); 433*0Sstevel@tonic-gate } 434*0Sstevel@tonic-gate } 435*0Sstevel@tonic-gate 436*0Sstevel@tonic-gate /* 437*0Sstevel@tonic-gate * Incoming IPv4 data from wire, is received here. Called from main. 438*0Sstevel@tonic-gate */ 439*0Sstevel@tonic-gate void 440*0Sstevel@tonic-gate in_data(struct phyint_instance *pii) 441*0Sstevel@tonic-gate { 442*0Sstevel@tonic-gate struct sockaddr_in from; 443*0Sstevel@tonic-gate struct in6_addr fromaddr; 444*0Sstevel@tonic-gate uint_t fromlen; 445*0Sstevel@tonic-gate static uint_t in_packet[(IP_MAXPACKET + 1)/4]; 446*0Sstevel@tonic-gate struct ip *ip; 447*0Sstevel@tonic-gate int iphlen; 448*0Sstevel@tonic-gate int len; 449*0Sstevel@tonic-gate char abuf[INET_ADDRSTRLEN]; 450*0Sstevel@tonic-gate struct pr_icmp *reply; 451*0Sstevel@tonic-gate 452*0Sstevel@tonic-gate if (debug & D_PROBE) { 453*0Sstevel@tonic-gate logdebug("in_data(%s %s)\n", 454*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 455*0Sstevel@tonic-gate } 456*0Sstevel@tonic-gate 457*0Sstevel@tonic-gate /* 458*0Sstevel@tonic-gate * Poll has already told us that a message is waiting, 459*0Sstevel@tonic-gate * on this socket. Read it now. We should not block. 
460*0Sstevel@tonic-gate */ 461*0Sstevel@tonic-gate fromlen = sizeof (from); 462*0Sstevel@tonic-gate len = recvfrom(pii->pii_probe_sock, (char *)in_packet, 463*0Sstevel@tonic-gate sizeof (in_packet), 0, (struct sockaddr *)&from, &fromlen); 464*0Sstevel@tonic-gate if (len < 0) { 465*0Sstevel@tonic-gate logperror_pii(pii, "in_data: recvfrom"); 466*0Sstevel@tonic-gate return; 467*0Sstevel@tonic-gate } 468*0Sstevel@tonic-gate 469*0Sstevel@tonic-gate /* 470*0Sstevel@tonic-gate * If the NIC has indicated the link is down, don't go 471*0Sstevel@tonic-gate * any further. 472*0Sstevel@tonic-gate */ 473*0Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 474*0Sstevel@tonic-gate return; 475*0Sstevel@tonic-gate 476*0Sstevel@tonic-gate /* Get the printable address for error reporting */ 477*0Sstevel@tonic-gate (void) inet_ntop(AF_INET, &from.sin_addr, abuf, sizeof (abuf)); 478*0Sstevel@tonic-gate 479*0Sstevel@tonic-gate /* Make sure packet contains at least minimum ICMP header */ 480*0Sstevel@tonic-gate ip = (struct ip *)in_packet; 481*0Sstevel@tonic-gate iphlen = ip->ip_hl << 2; 482*0Sstevel@tonic-gate if (len < iphlen + ICMP_MINLEN) { 483*0Sstevel@tonic-gate if (debug & D_PKTBAD) { 484*0Sstevel@tonic-gate logdebug("in_data: packet too short (%d bytes)" 485*0Sstevel@tonic-gate " from %s\n", len, abuf); 486*0Sstevel@tonic-gate } 487*0Sstevel@tonic-gate return; 488*0Sstevel@tonic-gate } 489*0Sstevel@tonic-gate 490*0Sstevel@tonic-gate /* 491*0Sstevel@tonic-gate * Subtract the IP hdr length, 'len' will be length of the probe 492*0Sstevel@tonic-gate * reply, starting from the icmp hdr. 493*0Sstevel@tonic-gate */ 494*0Sstevel@tonic-gate len -= iphlen; 495*0Sstevel@tonic-gate /* LINTED */ 496*0Sstevel@tonic-gate reply = (struct pr_icmp *)((char *)in_packet + iphlen); 497*0Sstevel@tonic-gate 498*0Sstevel@tonic-gate /* Probe replies are icmp echo replies. 
Ignore anything else */ 499*0Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP_ECHO_REPLY) 500*0Sstevel@tonic-gate return; 501*0Sstevel@tonic-gate 502*0Sstevel@tonic-gate /* 503*0Sstevel@tonic-gate * The icmp id should match what we sent, which is stored 504*0Sstevel@tonic-gate * in pi_icmpid. The icmp code for reply must be 0. 505*0Sstevel@tonic-gate * The reply content must be a struct pr_icmp 506*0Sstevel@tonic-gate */ 507*0Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 508*0Sstevel@tonic-gate /* Not in response to our probe */ 509*0Sstevel@tonic-gate return; 510*0Sstevel@tonic-gate } 511*0Sstevel@tonic-gate 512*0Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 513*0Sstevel@tonic-gate logtrace("probe reply code %d from %s on %s\n", 514*0Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 515*0Sstevel@tonic-gate return; 516*0Sstevel@tonic-gate } 517*0Sstevel@tonic-gate 518*0Sstevel@tonic-gate if (len < sizeof (struct pr_icmp)) { 519*0Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 520*0Sstevel@tonic-gate len, abuf, pii->pii_name); 521*0Sstevel@tonic-gate return; 522*0Sstevel@tonic-gate } 523*0Sstevel@tonic-gate 524*0Sstevel@tonic-gate IN6_INADDR_TO_V4MAPPED(&from.sin_addr, &fromaddr); 525*0Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) 526*0Sstevel@tonic-gate /* Unicast probe reply */ 527*0Sstevel@tonic-gate incoming_echo_reply(pii, reply, fromaddr); 528*0Sstevel@tonic-gate else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 529*0Sstevel@tonic-gate /* Multicast reply */ 530*0Sstevel@tonic-gate incoming_mcast_reply(pii, reply, fromaddr); 531*0Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 532*0Sstevel@tonic-gate incoming_rtt_reply(pii, reply, fromaddr); 533*0Sstevel@tonic-gate } else { 534*0Sstevel@tonic-gate /* Probably not in response to our probe */ 535*0Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 536*0Sstevel@tonic-gate 
reply->pr_icmp_mtype, abuf, pii->pii_name); 537*0Sstevel@tonic-gate return; 538*0Sstevel@tonic-gate } 539*0Sstevel@tonic-gate 540*0Sstevel@tonic-gate } 541*0Sstevel@tonic-gate 542*0Sstevel@tonic-gate /* 543*0Sstevel@tonic-gate * Incoming IPv6 data from wire is received here. Called from main. 544*0Sstevel@tonic-gate */ 545*0Sstevel@tonic-gate void 546*0Sstevel@tonic-gate in6_data(struct phyint_instance *pii) 547*0Sstevel@tonic-gate { 548*0Sstevel@tonic-gate struct sockaddr_in6 from; 549*0Sstevel@tonic-gate static uint64_t in_packet[(IP_MAXPACKET + 1)/8]; 550*0Sstevel@tonic-gate static uint64_t ancillary_data[(IP_MAXPACKET + 1)/8]; 551*0Sstevel@tonic-gate int len; 552*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 553*0Sstevel@tonic-gate struct msghdr msg; 554*0Sstevel@tonic-gate struct iovec iov; 555*0Sstevel@tonic-gate uchar_t *opt; 556*0Sstevel@tonic-gate struct pr_icmp *reply; 557*0Sstevel@tonic-gate 558*0Sstevel@tonic-gate if (debug & D_PROBE) { 559*0Sstevel@tonic-gate logdebug("in6_data(%s %s)\n", 560*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 561*0Sstevel@tonic-gate } 562*0Sstevel@tonic-gate 563*0Sstevel@tonic-gate iov.iov_base = (char *)in_packet; 564*0Sstevel@tonic-gate iov.iov_len = sizeof (in_packet); 565*0Sstevel@tonic-gate msg.msg_iov = &iov; 566*0Sstevel@tonic-gate msg.msg_iovlen = 1; 567*0Sstevel@tonic-gate msg.msg_name = (struct sockaddr *)&from; 568*0Sstevel@tonic-gate msg.msg_namelen = sizeof (from); 569*0Sstevel@tonic-gate msg.msg_control = ancillary_data; 570*0Sstevel@tonic-gate msg.msg_controllen = sizeof (ancillary_data); 571*0Sstevel@tonic-gate 572*0Sstevel@tonic-gate if ((len = recvmsg(pii->pii_probe_sock, &msg, 0)) < 0) { 573*0Sstevel@tonic-gate logperror_pii(pii, "in6_data: recvfrom"); 574*0Sstevel@tonic-gate return; 575*0Sstevel@tonic-gate } 576*0Sstevel@tonic-gate 577*0Sstevel@tonic-gate /* 578*0Sstevel@tonic-gate * If the NIC has indicated that the link is down, don't go 579*0Sstevel@tonic-gate * any further. 
580*0Sstevel@tonic-gate */ 581*0Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 582*0Sstevel@tonic-gate return; 583*0Sstevel@tonic-gate 584*0Sstevel@tonic-gate /* Get the printable address for error reporting */ 585*0Sstevel@tonic-gate (void) inet_ntop(AF_INET6, &from.sin6_addr, abuf, sizeof (abuf)); 586*0Sstevel@tonic-gate if (len < ICMP_MINLEN) { 587*0Sstevel@tonic-gate if (debug & D_PKTBAD) { 588*0Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 589*0Sstevel@tonic-gate msg.msg_flags, abuf); 590*0Sstevel@tonic-gate } 591*0Sstevel@tonic-gate return; 592*0Sstevel@tonic-gate } 593*0Sstevel@tonic-gate /* Ignore packets > 64k or control buffers that don't fit */ 594*0Sstevel@tonic-gate if (msg.msg_flags & (MSG_TRUNC|MSG_CTRUNC)) { 595*0Sstevel@tonic-gate if (debug & D_PKTBAD) { 596*0Sstevel@tonic-gate logdebug("Truncated message: msg_flags 0x%x from %s\n", 597*0Sstevel@tonic-gate msg.msg_flags, abuf); 598*0Sstevel@tonic-gate } 599*0Sstevel@tonic-gate return; 600*0Sstevel@tonic-gate } 601*0Sstevel@tonic-gate 602*0Sstevel@tonic-gate reply = (struct pr_icmp *)in_packet; 603*0Sstevel@tonic-gate if (reply->pr_icmp_type != ICMP6_ECHO_REPLY) 604*0Sstevel@tonic-gate return; 605*0Sstevel@tonic-gate 606*0Sstevel@tonic-gate if (reply->pr_icmp_id != pii->pii_icmpid) { 607*0Sstevel@tonic-gate /* Not in response to our probe */ 608*0Sstevel@tonic-gate return; 609*0Sstevel@tonic-gate } 610*0Sstevel@tonic-gate 611*0Sstevel@tonic-gate /* 612*0Sstevel@tonic-gate * The kernel has already verified the the ICMP checksum. 
613*0Sstevel@tonic-gate */ 614*0Sstevel@tonic-gate if (!IN6_IS_ADDR_LINKLOCAL(&from.sin6_addr)) { 615*0Sstevel@tonic-gate logtrace("ICMPv6 echo reply source address not linklocal from " 616*0Sstevel@tonic-gate "%s on %s\n", abuf, pii->pii_name); 617*0Sstevel@tonic-gate return; 618*0Sstevel@tonic-gate } 619*0Sstevel@tonic-gate opt = find_ancillary(&msg, IPV6_RTHDR); 620*0Sstevel@tonic-gate if (opt != NULL) { 621*0Sstevel@tonic-gate /* Can't allow routing headers in probe replies */ 622*0Sstevel@tonic-gate logtrace("message with routing header from %s on %s\n", 623*0Sstevel@tonic-gate abuf, pii->pii_name); 624*0Sstevel@tonic-gate return; 625*0Sstevel@tonic-gate } 626*0Sstevel@tonic-gate if (reply->pr_icmp_code != 0) { 627*0Sstevel@tonic-gate logtrace("probe reply code: %d from %s on %s\n", 628*0Sstevel@tonic-gate reply->pr_icmp_code, abuf, pii->pii_name); 629*0Sstevel@tonic-gate return; 630*0Sstevel@tonic-gate } 631*0Sstevel@tonic-gate if (len < (sizeof (struct pr_icmp))) { 632*0Sstevel@tonic-gate logtrace("probe reply too short: %d bytes from %s on %s\n", 633*0Sstevel@tonic-gate len, abuf, pii->pii_name); 634*0Sstevel@tonic-gate return; 635*0Sstevel@tonic-gate } 636*0Sstevel@tonic-gate if (reply->pr_icmp_mtype == htonl(PROBE_UNI)) { 637*0Sstevel@tonic-gate incoming_echo_reply(pii, reply, from.sin6_addr); 638*0Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_MULTI)) { 639*0Sstevel@tonic-gate incoming_mcast_reply(pii, reply, from.sin6_addr); 640*0Sstevel@tonic-gate } else if (reply->pr_icmp_mtype == htonl(PROBE_RTT)) { 641*0Sstevel@tonic-gate incoming_rtt_reply(pii, reply, from.sin6_addr); 642*0Sstevel@tonic-gate } else { 643*0Sstevel@tonic-gate /* Probably not in response to our probe */ 644*0Sstevel@tonic-gate logtrace("probe reply type: %d from %s on %s\n", 645*0Sstevel@tonic-gate reply->pr_icmp_mtype, abuf, pii->pii_name); 646*0Sstevel@tonic-gate } 647*0Sstevel@tonic-gate } 648*0Sstevel@tonic-gate 649*0Sstevel@tonic-gate /* 650*0Sstevel@tonic-gate 
* Process the incoming rtt reply, in response to our rtt probe. 651*0Sstevel@tonic-gate * Common for both IPv4 and IPv6. Unlike incoming_echo_reply() we don't 652*0Sstevel@tonic-gate * have any stored information about the probe we sent. So we don't log 653*0Sstevel@tonic-gate * any errors if we receive bad replies. 654*0Sstevel@tonic-gate */ 655*0Sstevel@tonic-gate static void 656*0Sstevel@tonic-gate incoming_rtt_reply(struct phyint_instance *pii, struct pr_icmp *reply, 657*0Sstevel@tonic-gate struct in6_addr fromaddr) 658*0Sstevel@tonic-gate { 659*0Sstevel@tonic-gate int m; /* rtt measurment in ms */ 660*0Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 661*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 662*0Sstevel@tonic-gate struct target *target; 663*0Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 664*0Sstevel@tonic-gate struct phyint_group *pg; 665*0Sstevel@tonic-gate 666*0Sstevel@tonic-gate /* Get the printable address for error reporting */ 667*0Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 668*0Sstevel@tonic-gate 669*0Sstevel@tonic-gate if (debug & D_PROBE) { 670*0Sstevel@tonic-gate logdebug("incoming_rtt_reply: %s %s %s\n", 671*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf); 672*0Sstevel@tonic-gate } 673*0Sstevel@tonic-gate 674*0Sstevel@tonic-gate /* Do we know this target ? */ 675*0Sstevel@tonic-gate target = target_lookup(pii, fromaddr); 676*0Sstevel@tonic-gate if (target == NULL) 677*0Sstevel@tonic-gate return; 678*0Sstevel@tonic-gate 679*0Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 680*0Sstevel@tonic-gate cur_time = getcurrenttime(); 681*0Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 682*0Sstevel@tonic-gate 683*0Sstevel@tonic-gate /* Invalid rtt. 
It has wrapped around */ 684*0Sstevel@tonic-gate if (m < 0) 685*0Sstevel@tonic-gate return; 686*0Sstevel@tonic-gate 687*0Sstevel@tonic-gate /* 688*0Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 689*0Sstevel@tonic-gate * The initial few responses after the interface is repaired may 690*0Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 691*0Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 692*0Sstevel@tonic-gate */ 693*0Sstevel@tonic-gate pg = pii->pii_phyint->pi_group; 694*0Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 695*0Sstevel@tonic-gate return; 696*0Sstevel@tonic-gate 697*0Sstevel@tonic-gate /* 698*0Sstevel@tonic-gate * Update rtt only if the new rtt is lower than the current rtt. 699*0Sstevel@tonic-gate * (specified by the 3rd parameter to pi_set_crtt). 700*0Sstevel@tonic-gate * If a spike has caused the current probe_interval to be > 701*0Sstevel@tonic-gate * user_probe_interval, then this mechanism is used to bring down 702*0Sstevel@tonic-gate * the rtt rapidly once the network stress is removed. 703*0Sstevel@tonic-gate * If the new rtt is higher than the current rtt, we don't want to 704*0Sstevel@tonic-gate * update the rtt. We are having more than 1 outstanding probe and 705*0Sstevel@tonic-gate * the increase in rtt we are seeing is being unnecessarily weighted 706*0Sstevel@tonic-gate * many times. The regular rtt update will be handled by 707*0Sstevel@tonic-gate * incoming_echo_reply() and will take care of any rtt increase. 
708*0Sstevel@tonic-gate */ 709*0Sstevel@tonic-gate pi_set_crtt(target, m, _B_FALSE); 710*0Sstevel@tonic-gate if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 711*0Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 712*0Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 713*0Sstevel@tonic-gate /* 714*0Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FT_TRIGGER, 715*0Sstevel@tonic-gate * investigate if we can improve the failure detection time to 716*0Sstevel@tonic-gate * meet whatever the user specified. 717*0Sstevel@tonic-gate */ 718*0Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 719*0Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 720*0Sstevel@tonic-gate user_failure_detection_time); 721*0Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 722*0Sstevel@tonic-gate if (pii->pii_phyint->pi_group != phyint_anongroup) { 723*0Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 724*0Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", 725*0Sstevel@tonic-gate pg->pg_fdt, AF_STR(pii->pii_af), 726*0Sstevel@tonic-gate pii->pii_name, 727*0Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_name); 728*0Sstevel@tonic-gate } 729*0Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 730*0Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 731*0Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 732*0Sstevel@tonic-gate /* 733*0Sstevel@tonic-gate * No more rtt probes will be sent. The actual 734*0Sstevel@tonic-gate * fdt has dropped to the user specified value. 735*0Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 736*0Sstevel@tonic-gate * will be in sync henceforth. 
737*0Sstevel@tonic-gate */ 738*0Sstevel@tonic-gate reset_snxt_basetimes(); 739*0Sstevel@tonic-gate } 740*0Sstevel@tonic-gate } 741*0Sstevel@tonic-gate } 742*0Sstevel@tonic-gate } 743*0Sstevel@tonic-gate 744*0Sstevel@tonic-gate /* 745*0Sstevel@tonic-gate * Process the incoming echo reply, in response to our unicast probe. 746*0Sstevel@tonic-gate * Common for both IPv4 and IPv6 747*0Sstevel@tonic-gate */ 748*0Sstevel@tonic-gate static void 749*0Sstevel@tonic-gate incoming_echo_reply(struct phyint_instance *pii, struct pr_icmp *reply, 750*0Sstevel@tonic-gate struct in6_addr fromaddr) 751*0Sstevel@tonic-gate { 752*0Sstevel@tonic-gate int m; /* rtt measurment in ms */ 753*0Sstevel@tonic-gate uint32_t cur_time; /* in ms from some arbitrary point */ 754*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 755*0Sstevel@tonic-gate int pr_ndx; 756*0Sstevel@tonic-gate struct target *target; 757*0Sstevel@tonic-gate boolean_t exception; 758*0Sstevel@tonic-gate uint32_t pr_icmp_timestamp; 759*0Sstevel@tonic-gate uint16_t pr_icmp_seq; 760*0Sstevel@tonic-gate struct phyint_group *pg = pii->pii_phyint->pi_group; 761*0Sstevel@tonic-gate 762*0Sstevel@tonic-gate /* Get the printable address for error reporting */ 763*0Sstevel@tonic-gate (void) pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)); 764*0Sstevel@tonic-gate 765*0Sstevel@tonic-gate if (debug & D_PROBE) { 766*0Sstevel@tonic-gate logdebug("incoming_echo_reply: %s %s %s seq %u\n", 767*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, abuf, 768*0Sstevel@tonic-gate ntohs(reply->pr_icmp_seq)); 769*0Sstevel@tonic-gate } 770*0Sstevel@tonic-gate 771*0Sstevel@tonic-gate pr_icmp_timestamp = ntohl(reply->pr_icmp_timestamp); 772*0Sstevel@tonic-gate pr_icmp_seq = ntohs(reply->pr_icmp_seq); 773*0Sstevel@tonic-gate 774*0Sstevel@tonic-gate /* Reject out of window probe replies */ 775*0Sstevel@tonic-gate if (SEQ_GE(pr_icmp_seq, pii->pii_snxt) || 776*0Sstevel@tonic-gate SEQ_LT(pr_icmp_seq, pii->pii_snxt - PROBE_STATS_COUNT)) { 
777*0Sstevel@tonic-gate logtrace("out of window probe seq %u snxt %u on %s from %s\n", 778*0Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 779*0Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 780*0Sstevel@tonic-gate return; 781*0Sstevel@tonic-gate } 782*0Sstevel@tonic-gate cur_time = getcurrenttime(); 783*0Sstevel@tonic-gate m = (int)(cur_time - pr_icmp_timestamp); 784*0Sstevel@tonic-gate if (m < 0) { 785*0Sstevel@tonic-gate /* 786*0Sstevel@tonic-gate * This is a ridiculously high value of rtt. rtt has wrapped 787*0Sstevel@tonic-gate * around. Log a message, and ignore the rtt. 788*0Sstevel@tonic-gate */ 789*0Sstevel@tonic-gate logerr("incoming_echo_reply: rtt wraparound cur_time %u reply " 790*0Sstevel@tonic-gate "timestamp %u\n", cur_time, pr_icmp_timestamp); 791*0Sstevel@tonic-gate } 792*0Sstevel@tonic-gate 793*0Sstevel@tonic-gate /* 794*0Sstevel@tonic-gate * Get the probe index pr_ndx corresponding to the received icmp seq. 795*0Sstevel@tonic-gate * number in our pii->pii_probes[] array. The icmp sequence number 796*0Sstevel@tonic-gate * pii_snxt corresponds to the probe index pii->pii_probe_next 797*0Sstevel@tonic-gate */ 798*0Sstevel@tonic-gate pr_ndx = MOD_SUB(pii->pii_probe_next, 799*0Sstevel@tonic-gate (uint16_t)(pii->pii_snxt - pr_icmp_seq), PROBE_STATS_COUNT); 800*0Sstevel@tonic-gate 801*0Sstevel@tonic-gate assert(PR_STATUS_VALID(pii->pii_probes[pr_ndx].pr_status)); 802*0Sstevel@tonic-gate 803*0Sstevel@tonic-gate target = pii->pii_probes[pr_ndx].pr_target; 804*0Sstevel@tonic-gate 805*0Sstevel@tonic-gate /* 806*0Sstevel@tonic-gate * Perform sanity checks, whether this probe reply that we 807*0Sstevel@tonic-gate * have received is genuine 808*0Sstevel@tonic-gate */ 809*0Sstevel@tonic-gate if (target != NULL) { 810*0Sstevel@tonic-gate /* 811*0Sstevel@tonic-gate * Compare the src. addr of the received ICMP or ICMPv6 812*0Sstevel@tonic-gate * probe reply with the target address in our tables. 
813*0Sstevel@tonic-gate */ 814*0Sstevel@tonic-gate if (!IN6_ARE_ADDR_EQUAL(&target->tg_address, &fromaddr)) { 815*0Sstevel@tonic-gate /* 816*0Sstevel@tonic-gate * We don't have any record of having sent a probe to 817*0Sstevel@tonic-gate * this target. This is a fake probe reply. Log an error 818*0Sstevel@tonic-gate */ 819*0Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 820*0Sstevel@tonic-gate "snxt %u on %s from %s\n", 821*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 822*0Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 823*0Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 824*0Sstevel@tonic-gate return; 825*0Sstevel@tonic-gate } else if (pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 826*0Sstevel@tonic-gate /* 827*0Sstevel@tonic-gate * The address matches, but our tables indicate that 828*0Sstevel@tonic-gate * this probe reply has been acked already. So this 829*0Sstevel@tonic-gate * is a duplicate probe reply. Log an error 830*0Sstevel@tonic-gate */ 831*0Sstevel@tonic-gate logtrace("probe status %d Duplicate probe reply seq %u " 832*0Sstevel@tonic-gate "snxt %u on %s from %s\n", 833*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 834*0Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 835*0Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 836*0Sstevel@tonic-gate return; 837*0Sstevel@tonic-gate } 838*0Sstevel@tonic-gate } else { 839*0Sstevel@tonic-gate /* 840*0Sstevel@tonic-gate * Target must not be NULL in the PR_UNACKED state 841*0Sstevel@tonic-gate */ 842*0Sstevel@tonic-gate assert(pii->pii_probes[pr_ndx].pr_status != PR_UNACKED); 843*0Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_status == PR_UNUSED) { 844*0Sstevel@tonic-gate /* 845*0Sstevel@tonic-gate * The probe stats slot is unused. So we didn't 846*0Sstevel@tonic-gate * send out any probe to this target. This is a fake. 847*0Sstevel@tonic-gate * Log an error. 
848*0Sstevel@tonic-gate */ 849*0Sstevel@tonic-gate logtrace("probe status %d Fake probe reply seq %u " 850*0Sstevel@tonic-gate "snxt %u on %s from %s\n", 851*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status, 852*0Sstevel@tonic-gate pr_icmp_seq, pii->pii_snxt, pii->pii_name, abuf); 853*0Sstevel@tonic-gate } 854*0Sstevel@tonic-gate pii->pii_cum_stats.unknown++; 855*0Sstevel@tonic-gate return; 856*0Sstevel@tonic-gate } 857*0Sstevel@tonic-gate 858*0Sstevel@tonic-gate /* 859*0Sstevel@tonic-gate * If the rtt does not appear to be right, don't update the 860*0Sstevel@tonic-gate * rtt stats. This can happen if the system dropped into the 861*0Sstevel@tonic-gate * debugger, or the system was hung or too busy for a 862*0Sstevel@tonic-gate * substantial time that we didn't get a chance to run. 863*0Sstevel@tonic-gate */ 864*0Sstevel@tonic-gate if ((m < 0) || (m > PROBE_STATS_COUNT * pg->pg_probeint)) { 865*0Sstevel@tonic-gate /* 866*0Sstevel@tonic-gate * If the probe corresponding to this receieved response 867*0Sstevel@tonic-gate * was truly sent 'm' ms. ago, then this response must 868*0Sstevel@tonic-gate * have been rejected by the sequence number checks. The 869*0Sstevel@tonic-gate * fact that it has passed the sequence number checks 870*0Sstevel@tonic-gate * means that the measured rtt is wrong. We were probably 871*0Sstevel@tonic-gate * scheduled long after the packet was received. 872*0Sstevel@tonic-gate */ 873*0Sstevel@tonic-gate goto out; 874*0Sstevel@tonic-gate } 875*0Sstevel@tonic-gate 876*0Sstevel@tonic-gate /* 877*0Sstevel@tonic-gate * Don't update rtt until we see NUM_PROBE_REPAIRS probe responses 878*0Sstevel@tonic-gate * The initial few responses after the interface is repaired may 879*0Sstevel@tonic-gate * contain high rtt's because they could have been queued up waiting 880*0Sstevel@tonic-gate * for ARP/NDP resolution on a failed interface. 
881*0Sstevel@tonic-gate */ 882*0Sstevel@tonic-gate if ((pii->pii_state != PI_RUNNING) || GROUP_FAILED(pg)) 883*0Sstevel@tonic-gate goto out; 884*0Sstevel@tonic-gate 885*0Sstevel@tonic-gate /* 886*0Sstevel@tonic-gate * Don't update the Conservative Round Trip Time estimate for this 887*0Sstevel@tonic-gate * (phint, target) pair if this is the not the highest ack seq seen 888*0Sstevel@tonic-gate * thus far on this target. 889*0Sstevel@tonic-gate */ 890*0Sstevel@tonic-gate if (!highest_ack_tg(pr_icmp_seq, target)) 891*0Sstevel@tonic-gate goto out; 892*0Sstevel@tonic-gate 893*0Sstevel@tonic-gate /* 894*0Sstevel@tonic-gate * Always update the rtt. This is a failure detection probe 895*0Sstevel@tonic-gate * and we want to measure both increase / decrease in rtt. 896*0Sstevel@tonic-gate */ 897*0Sstevel@tonic-gate pi_set_crtt(target, m, _B_TRUE); 898*0Sstevel@tonic-gate 899*0Sstevel@tonic-gate /* 900*0Sstevel@tonic-gate * If the crtt exceeds the average time between probes, 901*0Sstevel@tonic-gate * investigate if this slow target is an exception. If so we 902*0Sstevel@tonic-gate * can avoid this target and still meet the failure detection 903*0Sstevel@tonic-gate * time. Otherwise we can't meet the failure detection time. 904*0Sstevel@tonic-gate */ 905*0Sstevel@tonic-gate if (target->tg_crtt > pg->pg_probeint) { 906*0Sstevel@tonic-gate exception = check_exception_target(pii, target); 907*0Sstevel@tonic-gate if (exception) { 908*0Sstevel@tonic-gate /* 909*0Sstevel@tonic-gate * This target is exceptionally slow. Don't use it 910*0Sstevel@tonic-gate * for future probes. check_exception_target() has 911*0Sstevel@tonic-gate * made sure that we have at least MIN_PROBE_TARGETS 912*0Sstevel@tonic-gate * other active targets 913*0Sstevel@tonic-gate */ 914*0Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 915*0Sstevel@tonic-gate /* 916*0Sstevel@tonic-gate * This is a slow router, mark it as slow 917*0Sstevel@tonic-gate * and don't use it for further probes. 
We 918*0Sstevel@tonic-gate * don't delete it, since it will be populated 919*0Sstevel@tonic-gate * again when we do a router scan. Hence we 920*0Sstevel@tonic-gate * need to maintain extra state (unlike the 921*0Sstevel@tonic-gate * host case below). Mark it as TG_SLOW. 922*0Sstevel@tonic-gate */ 923*0Sstevel@tonic-gate if (target->tg_status == TG_ACTIVE) 924*0Sstevel@tonic-gate pii->pii_ntargets--; 925*0Sstevel@tonic-gate target->tg_status = TG_SLOW; 926*0Sstevel@tonic-gate target->tg_latime = gethrtime(); 927*0Sstevel@tonic-gate target->tg_rtt_sa = -1; 928*0Sstevel@tonic-gate target->tg_crtt = 0; 929*0Sstevel@tonic-gate target->tg_rtt_sd = 0; 930*0Sstevel@tonic-gate if (pii->pii_target_next == target) { 931*0Sstevel@tonic-gate pii->pii_target_next = 932*0Sstevel@tonic-gate target_next(target); 933*0Sstevel@tonic-gate } 934*0Sstevel@tonic-gate } else { 935*0Sstevel@tonic-gate /* 936*0Sstevel@tonic-gate * the slow target is not a router, we can 937*0Sstevel@tonic-gate * just delete it. Send an icmp multicast and 938*0Sstevel@tonic-gate * pick the fastest responder that is not 939*0Sstevel@tonic-gate * already an active target. target_delete() 940*0Sstevel@tonic-gate * adjusts pii->pii_target_next 941*0Sstevel@tonic-gate */ 942*0Sstevel@tonic-gate target_delete(target); 943*0Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 944*0Sstevel@tonic-gate } 945*0Sstevel@tonic-gate } else { 946*0Sstevel@tonic-gate /* 947*0Sstevel@tonic-gate * We can't meet the failure detection time. 948*0Sstevel@tonic-gate * Log a message, and update the detection time to 949*0Sstevel@tonic-gate * whatever we can achieve. 
950*0Sstevel@tonic-gate */ 951*0Sstevel@tonic-gate pg->pg_probeint = target->tg_crtt * NEXT_FDT_MULTIPLE; 952*0Sstevel@tonic-gate pg->pg_fdt = pg->pg_probeint * (NUM_PROBE_FAILS + 2); 953*0Sstevel@tonic-gate last_fdt_bumpup_time = gethrtime(); 954*0Sstevel@tonic-gate if (pg != phyint_anongroup) { 955*0Sstevel@tonic-gate logerr("Cannot meet requested failure detection" 956*0Sstevel@tonic-gate " time of %d ms on (%s %s) new failure" 957*0Sstevel@tonic-gate " detection time for group \"%s\" is %d" 958*0Sstevel@tonic-gate " ms\n", user_failure_detection_time, 959*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 960*0Sstevel@tonic-gate pg->pg_name, pg->pg_fdt); 961*0Sstevel@tonic-gate } 962*0Sstevel@tonic-gate } 963*0Sstevel@tonic-gate } else if ((target->tg_crtt < (pg->pg_probeint / LOWER_FDT_TRIGGER)) && 964*0Sstevel@tonic-gate (user_failure_detection_time < pg->pg_fdt) && 965*0Sstevel@tonic-gate (last_fdt_bumpup_time + MIN_SETTLING_TIME < gethrtime())) { 966*0Sstevel@tonic-gate /* 967*0Sstevel@tonic-gate * If the crtt has now dropped by a factor of LOWER_FDT_TRIGGER 968*0Sstevel@tonic-gate * investigate if we can improve the failure detection time to 969*0Sstevel@tonic-gate * meet whatever the user specified. 
970*0Sstevel@tonic-gate */ 971*0Sstevel@tonic-gate if (check_pg_crtt_improved(pg)) { 972*0Sstevel@tonic-gate pg->pg_fdt = MAX(pg->pg_fdt / NEXT_FDT_MULTIPLE, 973*0Sstevel@tonic-gate user_failure_detection_time); 974*0Sstevel@tonic-gate pg->pg_probeint = pg->pg_fdt / (NUM_PROBE_FAILS + 2); 975*0Sstevel@tonic-gate if (pg != phyint_anongroup) { 976*0Sstevel@tonic-gate logerr("Improved failure detection time %d ms " 977*0Sstevel@tonic-gate "on (%s %s) for group \"%s\"\n", pg->pg_fdt, 978*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 979*0Sstevel@tonic-gate pg->pg_name); 980*0Sstevel@tonic-gate } 981*0Sstevel@tonic-gate if (user_failure_detection_time == pg->pg_fdt) { 982*0Sstevel@tonic-gate /* Avoid any truncation or rounding errors */ 983*0Sstevel@tonic-gate pg->pg_probeint = user_probe_interval; 984*0Sstevel@tonic-gate /* 985*0Sstevel@tonic-gate * No more rtt probes will be sent. The actual 986*0Sstevel@tonic-gate * fdt has dropped to the user specified value. 987*0Sstevel@tonic-gate * pii_fd_snxt_basetime and pii_snxt_basetime 988*0Sstevel@tonic-gate * will be in sync henceforth. 989*0Sstevel@tonic-gate */ 990*0Sstevel@tonic-gate reset_snxt_basetimes(); 991*0Sstevel@tonic-gate } 992*0Sstevel@tonic-gate } 993*0Sstevel@tonic-gate } 994*0Sstevel@tonic-gate out: 995*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status = PR_ACKED; 996*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_time_acked = cur_time; 997*0Sstevel@tonic-gate 998*0Sstevel@tonic-gate /* 999*0Sstevel@tonic-gate * Update pii->pii_rack, i.e. the sequence number of the last received 1000*0Sstevel@tonic-gate * probe response, based on the echo reply we have received now, if 1001*0Sstevel@tonic-gate * either of the following conditions are satisfied. 1002*0Sstevel@tonic-gate * a. pii_rack is outside the current receive window of 1003*0Sstevel@tonic-gate * [pii->pii_snxt - PROBE_STATS_COUNT, pii->pii_snxt). 
1004*0Sstevel@tonic-gate * This means we have not received probe responses for a 1005*0Sstevel@tonic-gate * long time, and the sequence number has wrapped around. 1006*0Sstevel@tonic-gate * b. pii_rack is within the current receive window and this echo 1007*0Sstevel@tonic-gate * reply corresponds to the highest sequence number we have seen 1008*0Sstevel@tonic-gate * so far. 1009*0Sstevel@tonic-gate */ 1010*0Sstevel@tonic-gate if (SEQ_GE(pii->pii_rack, pii->pii_snxt) || 1011*0Sstevel@tonic-gate SEQ_LT(pii->pii_rack, pii->pii_snxt - PROBE_STATS_COUNT) || 1012*0Sstevel@tonic-gate SEQ_GT(pr_icmp_seq, pii->pii_rack)) { 1013*0Sstevel@tonic-gate pii->pii_rack = pr_icmp_seq; 1014*0Sstevel@tonic-gate } 1015*0Sstevel@tonic-gate } 1016*0Sstevel@tonic-gate 1017*0Sstevel@tonic-gate /* 1018*0Sstevel@tonic-gate * Returns true if seq is the highest unacknowledged seq for target tg 1019*0Sstevel@tonic-gate * else returns false 1020*0Sstevel@tonic-gate */ 1021*0Sstevel@tonic-gate static boolean_t 1022*0Sstevel@tonic-gate highest_ack_tg(uint16_t seq, struct target *tg) 1023*0Sstevel@tonic-gate { 1024*0Sstevel@tonic-gate struct phyint_instance *pii; 1025*0Sstevel@tonic-gate int pr_ndx; 1026*0Sstevel@tonic-gate uint16_t pr_seq; 1027*0Sstevel@tonic-gate 1028*0Sstevel@tonic-gate pii = tg->tg_phyint_inst; 1029*0Sstevel@tonic-gate 1030*0Sstevel@tonic-gate /* 1031*0Sstevel@tonic-gate * Get the seq number of the most recent probe sent so far, 1032*0Sstevel@tonic-gate * and also get the corresponding probe index in the probe stats 1033*0Sstevel@tonic-gate * array. 1034*0Sstevel@tonic-gate */ 1035*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1036*0Sstevel@tonic-gate pr_seq = pii->pii_snxt; 1037*0Sstevel@tonic-gate pr_seq--; 1038*0Sstevel@tonic-gate 1039*0Sstevel@tonic-gate /* 1040*0Sstevel@tonic-gate * Start from the most recent probe and walk back, trying to find 1041*0Sstevel@tonic-gate * an acked probe corresponding to target tg. 
1042*0Sstevel@tonic-gate */ 1043*0Sstevel@tonic-gate for (; pr_ndx != pii->pii_probe_next; 1044*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx), pr_seq--) { 1045*0Sstevel@tonic-gate if (pii->pii_probes[pr_ndx].pr_target == tg && 1046*0Sstevel@tonic-gate pii->pii_probes[pr_ndx].pr_status == PR_ACKED) { 1047*0Sstevel@tonic-gate if (SEQ_GT(pr_seq, seq)) 1048*0Sstevel@tonic-gate return (_B_FALSE); 1049*0Sstevel@tonic-gate } 1050*0Sstevel@tonic-gate } 1051*0Sstevel@tonic-gate return (_B_TRUE); 1052*0Sstevel@tonic-gate } 1053*0Sstevel@tonic-gate 1054*0Sstevel@tonic-gate /* 1055*0Sstevel@tonic-gate * Check whether the crtt for the group has improved by a factor of 1056*0Sstevel@tonic-gate * LOWER_FDT_TRIGGER. Small crtt improvements are ignored to avoid failure 1057*0Sstevel@tonic-gate * detection time flapping in the face of small crtt changes. 1058*0Sstevel@tonic-gate */ 1059*0Sstevel@tonic-gate static boolean_t 1060*0Sstevel@tonic-gate check_pg_crtt_improved(struct phyint_group *pg) 1061*0Sstevel@tonic-gate { 1062*0Sstevel@tonic-gate struct phyint *pi; 1063*0Sstevel@tonic-gate 1064*0Sstevel@tonic-gate if (debug & D_PROBE) 1065*0Sstevel@tonic-gate logdebug("check_pg_crtt_improved()\n"); 1066*0Sstevel@tonic-gate 1067*0Sstevel@tonic-gate /* 1068*0Sstevel@tonic-gate * The crtt for the group is only improved if each phyint_instance 1069*0Sstevel@tonic-gate * for both ipv4 and ipv6 is improved. 1070*0Sstevel@tonic-gate */ 1071*0Sstevel@tonic-gate for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 1072*0Sstevel@tonic-gate if (!check_pii_crtt_improved(pi->pi_v4) || 1073*0Sstevel@tonic-gate !check_pii_crtt_improved(pi->pi_v6)) 1074*0Sstevel@tonic-gate return (_B_FALSE); 1075*0Sstevel@tonic-gate } 1076*0Sstevel@tonic-gate 1077*0Sstevel@tonic-gate return (_B_TRUE); 1078*0Sstevel@tonic-gate } 1079*0Sstevel@tonic-gate 1080*0Sstevel@tonic-gate /* 1081*0Sstevel@tonic-gate * Check whether the crtt has improved substantially on this phyint_instance. 
1082*0Sstevel@tonic-gate * Returns _B_TRUE if there's no crtt information available, because pii 1083*0Sstevel@tonic-gate * is NULL or the phyint_instance is not capable of probing. 1084*0Sstevel@tonic-gate */ 1085*0Sstevel@tonic-gate boolean_t 1086*0Sstevel@tonic-gate check_pii_crtt_improved(struct phyint_instance *pii) { 1087*0Sstevel@tonic-gate struct target *tg; 1088*0Sstevel@tonic-gate 1089*0Sstevel@tonic-gate if (pii == NULL) 1090*0Sstevel@tonic-gate return (_B_TRUE); 1091*0Sstevel@tonic-gate 1092*0Sstevel@tonic-gate if (!PROBE_CAPABLE(pii) || 1093*0Sstevel@tonic-gate pii->pii_phyint->pi_state == PI_FAILED) 1094*0Sstevel@tonic-gate return (_B_TRUE); 1095*0Sstevel@tonic-gate 1096*0Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1097*0Sstevel@tonic-gate if (tg->tg_status != TG_ACTIVE) 1098*0Sstevel@tonic-gate continue; 1099*0Sstevel@tonic-gate if (tg->tg_crtt > (pii->pii_phyint->pi_group->pg_probeint / 1100*0Sstevel@tonic-gate LOWER_FDT_TRIGGER)) { 1101*0Sstevel@tonic-gate return (_B_FALSE); 1102*0Sstevel@tonic-gate } 1103*0Sstevel@tonic-gate } 1104*0Sstevel@tonic-gate 1105*0Sstevel@tonic-gate return (_B_TRUE); 1106*0Sstevel@tonic-gate } 1107*0Sstevel@tonic-gate 1108*0Sstevel@tonic-gate /* 1109*0Sstevel@tonic-gate * This target responds very slowly to probes. The target's crtt exceeds 1110*0Sstevel@tonic-gate * the probe interval of its group. 
Compare against other targets 1111*0Sstevel@tonic-gate * and determine if this target is an exception, if so return true, else false 1112*0Sstevel@tonic-gate */ 1113*0Sstevel@tonic-gate static boolean_t 1114*0Sstevel@tonic-gate check_exception_target(struct phyint_instance *pii, struct target *target) 1115*0Sstevel@tonic-gate { 1116*0Sstevel@tonic-gate struct target *tg; 1117*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 1118*0Sstevel@tonic-gate 1119*0Sstevel@tonic-gate if (debug & D_PROBE) { 1120*0Sstevel@tonic-gate logdebug("check_exception_target(%s %s target %s)\n", 1121*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 1122*0Sstevel@tonic-gate pr_addr(pii->pii_af, target->tg_address, 1123*0Sstevel@tonic-gate abuf, sizeof (abuf))); 1124*0Sstevel@tonic-gate } 1125*0Sstevel@tonic-gate 1126*0Sstevel@tonic-gate /* 1127*0Sstevel@tonic-gate * We should have at least MIN_PROBE_TARGETS + 1 good targets now, 1128*0Sstevel@tonic-gate * to make a good judgement. Otherwise don't drop this target. 1129*0Sstevel@tonic-gate */ 1130*0Sstevel@tonic-gate if (pii->pii_ntargets < MIN_PROBE_TARGETS + 1) 1131*0Sstevel@tonic-gate return (_B_FALSE); 1132*0Sstevel@tonic-gate 1133*0Sstevel@tonic-gate /* 1134*0Sstevel@tonic-gate * Determine whether only this particular target is slow. 1135*0Sstevel@tonic-gate * We know that this target's crtt exceeds the group's probe interval. 1136*0Sstevel@tonic-gate * If all other active targets have a 1137*0Sstevel@tonic-gate * crtt < (this group's probe interval) / EXCEPTION_FACTOR, 1138*0Sstevel@tonic-gate * then this target is considered slow. 
1139*0Sstevel@tonic-gate */ 1140*0Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1141*0Sstevel@tonic-gate if (tg != target && tg->tg_status == TG_ACTIVE) { 1142*0Sstevel@tonic-gate if (tg->tg_crtt > 1143*0Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint / 1144*0Sstevel@tonic-gate EXCEPTION_FACTOR) { 1145*0Sstevel@tonic-gate return (_B_FALSE); 1146*0Sstevel@tonic-gate } 1147*0Sstevel@tonic-gate } 1148*0Sstevel@tonic-gate } 1149*0Sstevel@tonic-gate 1150*0Sstevel@tonic-gate return (_B_TRUE); 1151*0Sstevel@tonic-gate } 1152*0Sstevel@tonic-gate 1153*0Sstevel@tonic-gate /* 1154*0Sstevel@tonic-gate * Update the target list. The icmp all hosts multicast has given us 1155*0Sstevel@tonic-gate * some host to which we can send probes. If we already have sufficient 1156*0Sstevel@tonic-gate * targets, discard it. 1157*0Sstevel@tonic-gate */ 1158*0Sstevel@tonic-gate static void 1159*0Sstevel@tonic-gate incoming_mcast_reply(struct phyint_instance *pii, struct pr_icmp *reply, 1160*0Sstevel@tonic-gate struct in6_addr fromaddr) 1161*0Sstevel@tonic-gate /* ARGSUSED */ 1162*0Sstevel@tonic-gate { 1163*0Sstevel@tonic-gate int af; 1164*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 1165*0Sstevel@tonic-gate struct phyint *pi; 1166*0Sstevel@tonic-gate 1167*0Sstevel@tonic-gate if (debug & D_PROBE) { 1168*0Sstevel@tonic-gate logdebug("incoming_mcast_reply(%s %s %s)\n", 1169*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name, 1170*0Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf))); 1171*0Sstevel@tonic-gate } 1172*0Sstevel@tonic-gate 1173*0Sstevel@tonic-gate /* 1174*0Sstevel@tonic-gate * Using host targets is a fallback mechanism. If we have 1175*0Sstevel@tonic-gate * found a router, don't add this host target. If we already 1176*0Sstevel@tonic-gate * know MAX_PROBE_TARGETS, don't add another target. 
1177*0Sstevel@tonic-gate */ 1178*0Sstevel@tonic-gate assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1179*0Sstevel@tonic-gate if (pii->pii_targets != NULL) { 1180*0Sstevel@tonic-gate if (pii->pii_targets_are_routers || 1181*0Sstevel@tonic-gate (pii->pii_ntargets == MAX_PROBE_TARGETS)) { 1182*0Sstevel@tonic-gate return; 1183*0Sstevel@tonic-gate } 1184*0Sstevel@tonic-gate } 1185*0Sstevel@tonic-gate 1186*0Sstevel@tonic-gate if (IN6_IS_ADDR_UNSPECIFIED(&fromaddr) || 1187*0Sstevel@tonic-gate IN6_IS_ADDR_V4MAPPED_ANY(&fromaddr)) { 1188*0Sstevel@tonic-gate /* 1189*0Sstevel@tonic-gate * Guard against response from 0.0.0.0 1190*0Sstevel@tonic-gate * and ::. Log a trace message 1191*0Sstevel@tonic-gate */ 1192*0Sstevel@tonic-gate logtrace("probe response from %s on %s\n", 1193*0Sstevel@tonic-gate pr_addr(pii->pii_af, fromaddr, abuf, sizeof (abuf)), 1194*0Sstevel@tonic-gate pii->pii_name); 1195*0Sstevel@tonic-gate return; 1196*0Sstevel@tonic-gate } 1197*0Sstevel@tonic-gate 1198*0Sstevel@tonic-gate /* 1199*0Sstevel@tonic-gate * This address is one of our own, so reject this address as a 1200*0Sstevel@tonic-gate * valid probe target. 1201*0Sstevel@tonic-gate */ 1202*0Sstevel@tonic-gate af = pii->pii_af; 1203*0Sstevel@tonic-gate if (own_address(af, fromaddr)) 1204*0Sstevel@tonic-gate return; 1205*0Sstevel@tonic-gate 1206*0Sstevel@tonic-gate /* 1207*0Sstevel@tonic-gate * If the phyint is part a named group, then add the address to all 1208*0Sstevel@tonic-gate * members of the group. Otherwise, add the address only to the 1209*0Sstevel@tonic-gate * phyint itself, since other phyints in the anongroup may not be on 1210*0Sstevel@tonic-gate * the same subnet. 
1211*0Sstevel@tonic-gate */ 1212*0Sstevel@tonic-gate pi = pii->pii_phyint; 1213*0Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 1214*0Sstevel@tonic-gate target_add(pii, fromaddr, _B_FALSE); 1215*0Sstevel@tonic-gate } else { 1216*0Sstevel@tonic-gate pi = pi->pi_group->pg_phyint; 1217*0Sstevel@tonic-gate for (; pi != NULL; pi = pi->pi_pgnext) 1218*0Sstevel@tonic-gate target_add(PHYINT_INSTANCE(pi, af), fromaddr, _B_FALSE); 1219*0Sstevel@tonic-gate } 1220*0Sstevel@tonic-gate } 1221*0Sstevel@tonic-gate 1222*0Sstevel@tonic-gate /* 1223*0Sstevel@tonic-gate * Compute CRTT given an existing scaled average, scaled deviation estimate 1224*0Sstevel@tonic-gate * and a new rtt time. The formula is from Jacobson and Karels' 1225*0Sstevel@tonic-gate * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names 1226*0Sstevel@tonic-gate * are the same as those in Appendix A.2 of that paper. 1227*0Sstevel@tonic-gate * 1228*0Sstevel@tonic-gate * m = new measurement 1229*0Sstevel@tonic-gate * sa = scaled RTT average (8 * average estimates) 1230*0Sstevel@tonic-gate * sv = scaled mean deviation (mdev) of RTT (4 * deviation estimates). 1231*0Sstevel@tonic-gate * crtt = Conservative round trip time. Used to determine whether probe 1232*0Sstevel@tonic-gate * has timed out. 
1233*0Sstevel@tonic-gate * 1234*0Sstevel@tonic-gate * New scaled average and deviation are passed back via sap and svp 1235*0Sstevel@tonic-gate */ 1236*0Sstevel@tonic-gate static int 1237*0Sstevel@tonic-gate compute_crtt(int *sap, int *svp, int m) 1238*0Sstevel@tonic-gate { 1239*0Sstevel@tonic-gate int sa = *sap; 1240*0Sstevel@tonic-gate int sv = *svp; 1241*0Sstevel@tonic-gate int crtt; 1242*0Sstevel@tonic-gate int saved_m = m; 1243*0Sstevel@tonic-gate 1244*0Sstevel@tonic-gate assert(*sap >= -1); 1245*0Sstevel@tonic-gate assert(*svp >= 0); 1246*0Sstevel@tonic-gate 1247*0Sstevel@tonic-gate if (sa != -1) { 1248*0Sstevel@tonic-gate /* 1249*0Sstevel@tonic-gate * Update average estimator: 1250*0Sstevel@tonic-gate * new rtt = old rtt + 1/8 Error 1251*0Sstevel@tonic-gate * where Error = m - old rtt 1252*0Sstevel@tonic-gate * i.e. 8 * new rtt = 8 * old rtt + Error 1253*0Sstevel@tonic-gate * i.e. new sa = old sa + Error 1254*0Sstevel@tonic-gate */ 1255*0Sstevel@tonic-gate m -= sa >> 3; /* m is now Error in estimate. */ 1256*0Sstevel@tonic-gate if ((sa += m) < 0) { 1257*0Sstevel@tonic-gate /* Don't allow the smoothed average to be negative. */ 1258*0Sstevel@tonic-gate sa = 0; 1259*0Sstevel@tonic-gate } 1260*0Sstevel@tonic-gate 1261*0Sstevel@tonic-gate /* 1262*0Sstevel@tonic-gate * Update deviation estimator: 1263*0Sstevel@tonic-gate * new mdev = old mdev + 1/4 (abs(Error) - old mdev) 1264*0Sstevel@tonic-gate * i.e. 4 * new mdev = 4 * old mdev + 1265*0Sstevel@tonic-gate * (abs(Error) - old mdev) 1266*0Sstevel@tonic-gate * i.e. new sv = old sv + (abs(Error) - old mdev) 1267*0Sstevel@tonic-gate */ 1268*0Sstevel@tonic-gate if (m < 0) 1269*0Sstevel@tonic-gate m = -m; 1270*0Sstevel@tonic-gate m -= sv >> 2; 1271*0Sstevel@tonic-gate sv += m; 1272*0Sstevel@tonic-gate } else { 1273*0Sstevel@tonic-gate /* Initialization. This is the first response received. 
*/ 1274*0Sstevel@tonic-gate sa = (m << 3); 1275*0Sstevel@tonic-gate sv = (m << 1); 1276*0Sstevel@tonic-gate } 1277*0Sstevel@tonic-gate 1278*0Sstevel@tonic-gate crtt = (sa >> 3) + sv; 1279*0Sstevel@tonic-gate 1280*0Sstevel@tonic-gate if (debug & D_PROBE) { 1281*0Sstevel@tonic-gate logdebug("compute_crtt: m = %d sa = %d, sv = %d -> crtt = " 1282*0Sstevel@tonic-gate "%d\n", saved_m, sa, sv, crtt); 1283*0Sstevel@tonic-gate } 1284*0Sstevel@tonic-gate 1285*0Sstevel@tonic-gate *sap = sa; 1286*0Sstevel@tonic-gate *svp = sv; 1287*0Sstevel@tonic-gate 1288*0Sstevel@tonic-gate /* 1289*0Sstevel@tonic-gate * CRTT = average estimates + 4 * deviation estimates 1290*0Sstevel@tonic-gate * = sa / 8 + sv 1291*0Sstevel@tonic-gate */ 1292*0Sstevel@tonic-gate return (crtt); 1293*0Sstevel@tonic-gate } 1294*0Sstevel@tonic-gate 1295*0Sstevel@tonic-gate static void 1296*0Sstevel@tonic-gate pi_set_crtt(struct target *tg, int m, boolean_t is_probe_uni) 1297*0Sstevel@tonic-gate { 1298*0Sstevel@tonic-gate struct phyint_instance *pii = tg->tg_phyint_inst; 1299*0Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1300*0Sstevel@tonic-gate int sa = tg->tg_rtt_sa; 1301*0Sstevel@tonic-gate int sv = tg->tg_rtt_sd; 1302*0Sstevel@tonic-gate int new_crtt; 1303*0Sstevel@tonic-gate int i; 1304*0Sstevel@tonic-gate 1305*0Sstevel@tonic-gate if (debug & D_PROBE) 1306*0Sstevel@tonic-gate logdebug("pi_set_crtt: target - m %d\n", m); 1307*0Sstevel@tonic-gate 1308*0Sstevel@tonic-gate /* store the round trip time, in case we need to defer computation */ 1309*0Sstevel@tonic-gate tg->tg_deferred[tg->tg_num_deferred] = m; 1310*0Sstevel@tonic-gate 1311*0Sstevel@tonic-gate new_crtt = compute_crtt(&sa, &sv, m); 1312*0Sstevel@tonic-gate 1313*0Sstevel@tonic-gate /* 1314*0Sstevel@tonic-gate * If this probe's round trip time would singlehandedly cause an 1315*0Sstevel@tonic-gate * increase in the group's probe interval consider it suspect. 
1316*0Sstevel@tonic-gate */ 1317*0Sstevel@tonic-gate if ((new_crtt > probe_interval) && is_probe_uni) { 1318*0Sstevel@tonic-gate if (debug & D_PROBE) { 1319*0Sstevel@tonic-gate logdebug("Received a suspect probe on %s, new_crtt =" 1320*0Sstevel@tonic-gate " %d, probe_interval = %d, num_deferred = %d\n", 1321*0Sstevel@tonic-gate pii->pii_probe_logint->li_name, new_crtt, 1322*0Sstevel@tonic-gate probe_interval, tg->tg_num_deferred); 1323*0Sstevel@tonic-gate } 1324*0Sstevel@tonic-gate 1325*0Sstevel@tonic-gate /* 1326*0Sstevel@tonic-gate * If we've deferred as many rtts as we plan on deferring, then 1327*0Sstevel@tonic-gate * assume the link really did slow down and process all queued 1328*0Sstevel@tonic-gate * rtts 1329*0Sstevel@tonic-gate */ 1330*0Sstevel@tonic-gate if (tg->tg_num_deferred == MAXDEFERREDRTT) { 1331*0Sstevel@tonic-gate if (debug & D_PROBE) { 1332*0Sstevel@tonic-gate logdebug("Received MAXDEFERREDRTT probes which " 1333*0Sstevel@tonic-gate "would cause an increased probe_interval. " 1334*0Sstevel@tonic-gate "Integrating queued rtt data points.\n"); 1335*0Sstevel@tonic-gate } 1336*0Sstevel@tonic-gate 1337*0Sstevel@tonic-gate for (i = 0; i <= tg->tg_num_deferred; i++) { 1338*0Sstevel@tonic-gate tg->tg_crtt = compute_crtt(&tg->tg_rtt_sa, 1339*0Sstevel@tonic-gate &tg->tg_rtt_sd, tg->tg_deferred[i]); 1340*0Sstevel@tonic-gate } 1341*0Sstevel@tonic-gate 1342*0Sstevel@tonic-gate tg->tg_num_deferred = 0; 1343*0Sstevel@tonic-gate } else { 1344*0Sstevel@tonic-gate tg->tg_num_deferred++; 1345*0Sstevel@tonic-gate } 1346*0Sstevel@tonic-gate return; 1347*0Sstevel@tonic-gate } 1348*0Sstevel@tonic-gate 1349*0Sstevel@tonic-gate /* 1350*0Sstevel@tonic-gate * If this is a normal probe, or an RTT probe that would lead to a 1351*0Sstevel@tonic-gate * reduced CRTT, then update our CRTT data. Further, if this was 1352*0Sstevel@tonic-gate * a normal probe, pitch any deferred probes since our probes are 1353*0Sstevel@tonic-gate * again being answered within our CRTT estimates. 
1354*0Sstevel@tonic-gate */ 1355*0Sstevel@tonic-gate if (is_probe_uni || new_crtt < tg->tg_crtt) { 1356*0Sstevel@tonic-gate tg->tg_rtt_sa = sa; 1357*0Sstevel@tonic-gate tg->tg_rtt_sd = sv; 1358*0Sstevel@tonic-gate tg->tg_crtt = new_crtt; 1359*0Sstevel@tonic-gate if (is_probe_uni) 1360*0Sstevel@tonic-gate tg->tg_num_deferred = 0; 1361*0Sstevel@tonic-gate } 1362*0Sstevel@tonic-gate } 1363*0Sstevel@tonic-gate 1364*0Sstevel@tonic-gate /* 1365*0Sstevel@tonic-gate * Return a pointer to the specified option buffer. 1366*0Sstevel@tonic-gate * If not found return NULL. 1367*0Sstevel@tonic-gate */ 1368*0Sstevel@tonic-gate static void * 1369*0Sstevel@tonic-gate find_ancillary(struct msghdr *msg, int cmsg_type) 1370*0Sstevel@tonic-gate { 1371*0Sstevel@tonic-gate struct cmsghdr *cmsg; 1372*0Sstevel@tonic-gate 1373*0Sstevel@tonic-gate for (cmsg = CMSG_FIRSTHDR(msg); cmsg != NULL; 1374*0Sstevel@tonic-gate cmsg = CMSG_NXTHDR(msg, cmsg)) { 1375*0Sstevel@tonic-gate if (cmsg->cmsg_level == IPPROTO_IPV6 && 1376*0Sstevel@tonic-gate cmsg->cmsg_type == cmsg_type) { 1377*0Sstevel@tonic-gate return (CMSG_DATA(cmsg)); 1378*0Sstevel@tonic-gate } 1379*0Sstevel@tonic-gate } 1380*0Sstevel@tonic-gate return (NULL); 1381*0Sstevel@tonic-gate } 1382*0Sstevel@tonic-gate 1383*0Sstevel@tonic-gate /* 1384*0Sstevel@tonic-gate * See if a previously failed interface has started working again. 
1385*0Sstevel@tonic-gate */ 1386*0Sstevel@tonic-gate void 1387*0Sstevel@tonic-gate phyint_check_for_repair(struct phyint *pi) 1388*0Sstevel@tonic-gate { 1389*0Sstevel@tonic-gate if (phyint_repaired(pi)) { 1390*0Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 1391*0Sstevel@tonic-gate logerr("NIC repair detected on %s\n", pi->pi_name); 1392*0Sstevel@tonic-gate } else { 1393*0Sstevel@tonic-gate logerr("NIC repair detected on %s of group %s\n", 1394*0Sstevel@tonic-gate pi->pi_name, pi->pi_group->pg_name); 1395*0Sstevel@tonic-gate } 1396*0Sstevel@tonic-gate 1397*0Sstevel@tonic-gate /* 1398*0Sstevel@tonic-gate * If the interface is offline, just clear the FAILED flag, 1399*0Sstevel@tonic-gate * delaying the state change and failback operation until it 1400*0Sstevel@tonic-gate * is brought back online. 1401*0Sstevel@tonic-gate */ 1402*0Sstevel@tonic-gate if (pi->pi_state == PI_OFFLINE) { 1403*0Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1404*0Sstevel@tonic-gate return; 1405*0Sstevel@tonic-gate } 1406*0Sstevel@tonic-gate 1407*0Sstevel@tonic-gate if (pi->pi_flags & IFF_INACTIVE) { 1408*0Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1409*0Sstevel@tonic-gate } else { 1410*0Sstevel@tonic-gate if (try_failback(pi, _B_FALSE) != IPMP_FAILURE) { 1411*0Sstevel@tonic-gate (void) change_lif_flags(pi, 1412*0Sstevel@tonic-gate IFF_FAILED, _B_FALSE); 1413*0Sstevel@tonic-gate /* Per state diagram */ 1414*0Sstevel@tonic-gate pi->pi_empty = 0; 1415*0Sstevel@tonic-gate } 1416*0Sstevel@tonic-gate } 1417*0Sstevel@tonic-gate 1418*0Sstevel@tonic-gate phyint_chstate(pi, PI_RUNNING); 1419*0Sstevel@tonic-gate 1420*0Sstevel@tonic-gate if (GROUP_FAILED(pi->pi_group)) { 1421*0Sstevel@tonic-gate /* 1422*0Sstevel@tonic-gate * This is the 1st phyint to receive a response 1423*0Sstevel@tonic-gate * after group failure. 
1424*0Sstevel@tonic-gate */ 1425*0Sstevel@tonic-gate logerr("At least 1 interface (%s) of group %s has " 1426*0Sstevel@tonic-gate "repaired\n", pi->pi_name, pi->pi_group->pg_name); 1427*0Sstevel@tonic-gate phyint_group_chstate(pi->pi_group, PG_RUNNING); 1428*0Sstevel@tonic-gate } 1429*0Sstevel@tonic-gate } 1430*0Sstevel@tonic-gate } 1431*0Sstevel@tonic-gate 1432*0Sstevel@tonic-gate /* 1433*0Sstevel@tonic-gate * See if a previously functioning interface has failed, or if the 1434*0Sstevel@tonic-gate * whole group of interfaces has failed. 1435*0Sstevel@tonic-gate */ 1436*0Sstevel@tonic-gate static void 1437*0Sstevel@tonic-gate phyint_inst_check_for_failure(struct phyint_instance *pii) 1438*0Sstevel@tonic-gate { 1439*0Sstevel@tonic-gate struct phyint *pi; 1440*0Sstevel@tonic-gate struct phyint *pi2; 1441*0Sstevel@tonic-gate 1442*0Sstevel@tonic-gate pi = pii->pii_phyint; 1443*0Sstevel@tonic-gate 1444*0Sstevel@tonic-gate switch (failure_state(pii)) { 1445*0Sstevel@tonic-gate case PHYINT_FAILURE: 1446*0Sstevel@tonic-gate (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1447*0Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 1448*0Sstevel@tonic-gate logerr("NIC failure detected on %s\n", pii->pii_name); 1449*0Sstevel@tonic-gate } else { 1450*0Sstevel@tonic-gate logerr("NIC failure detected on %s of group %s\n", 1451*0Sstevel@tonic-gate pii->pii_name, pi->pi_group->pg_name); 1452*0Sstevel@tonic-gate } 1453*0Sstevel@tonic-gate /* 1454*0Sstevel@tonic-gate * Do the failover, unless the interface is offline (in 1455*0Sstevel@tonic-gate * which case we've already failed over). 
1456*0Sstevel@tonic-gate */ 1457*0Sstevel@tonic-gate if (pi->pi_state != PI_OFFLINE) { 1458*0Sstevel@tonic-gate phyint_chstate(pi, PI_FAILED); 1459*0Sstevel@tonic-gate reset_crtt_all(pi); 1460*0Sstevel@tonic-gate if (!(pi->pi_flags & IFF_INACTIVE)) 1461*0Sstevel@tonic-gate (void) try_failover(pi, FAILOVER_NORMAL); 1462*0Sstevel@tonic-gate } 1463*0Sstevel@tonic-gate break; 1464*0Sstevel@tonic-gate 1465*0Sstevel@tonic-gate case GROUP_FAILURE: 1466*0Sstevel@tonic-gate logerr("All Interfaces in group %s have failed\n", 1467*0Sstevel@tonic-gate pi->pi_group->pg_name); 1468*0Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; 1469*0Sstevel@tonic-gate pi2 = pi2->pi_pgnext) { 1470*0Sstevel@tonic-gate if (pi2->pi_flags & IFF_OFFLINE) 1471*0Sstevel@tonic-gate continue; 1472*0Sstevel@tonic-gate (void) change_lif_flags(pi2, IFF_FAILED, _B_TRUE); 1473*0Sstevel@tonic-gate reset_crtt_all(pi2); 1474*0Sstevel@tonic-gate 1475*0Sstevel@tonic-gate /* 1476*0Sstevel@tonic-gate * In the case of host targets, we 1477*0Sstevel@tonic-gate * would have flushed the targets, 1478*0Sstevel@tonic-gate * and gone to PI_NOTARGETS state. 1479*0Sstevel@tonic-gate */ 1480*0Sstevel@tonic-gate if (pi2->pi_state == PI_RUNNING) 1481*0Sstevel@tonic-gate phyint_chstate(pi, PI_FAILED); 1482*0Sstevel@tonic-gate 1483*0Sstevel@tonic-gate pi2->pi_empty = 0; 1484*0Sstevel@tonic-gate pi2->pi_full = 0; 1485*0Sstevel@tonic-gate } 1486*0Sstevel@tonic-gate break; 1487*0Sstevel@tonic-gate 1488*0Sstevel@tonic-gate default: 1489*0Sstevel@tonic-gate break; 1490*0Sstevel@tonic-gate } 1491*0Sstevel@tonic-gate } 1492*0Sstevel@tonic-gate 1493*0Sstevel@tonic-gate /* 1494*0Sstevel@tonic-gate * Determines if any timeout event has occurred and returns the number of 1495*0Sstevel@tonic-gate * milliseconds until the next timeout event for the phyint. Returns 1496*0Sstevel@tonic-gate * TIMER_INFINITY for "never". 
1497*0Sstevel@tonic-gate */ 1498*0Sstevel@tonic-gate uint_t 1499*0Sstevel@tonic-gate phyint_inst_timer(struct phyint_instance *pii) 1500*0Sstevel@tonic-gate { 1501*0Sstevel@tonic-gate int pr_ndx; 1502*0Sstevel@tonic-gate uint_t timeout; 1503*0Sstevel@tonic-gate struct target *cur_tg; 1504*0Sstevel@tonic-gate struct probe_stats *pr_statp; 1505*0Sstevel@tonic-gate struct phyint_instance *pii_other; 1506*0Sstevel@tonic-gate struct phyint *pi; 1507*0Sstevel@tonic-gate int valid_unack_count; 1508*0Sstevel@tonic-gate int i; 1509*0Sstevel@tonic-gate int interval; 1510*0Sstevel@tonic-gate uint_t check_time; 1511*0Sstevel@tonic-gate uint_t cur_time; 1512*0Sstevel@tonic-gate hrtime_t cur_hrtime; 1513*0Sstevel@tonic-gate int probe_interval = pii->pii_phyint->pi_group->pg_probeint; 1514*0Sstevel@tonic-gate 1515*0Sstevel@tonic-gate cur_time = getcurrenttime(); 1516*0Sstevel@tonic-gate 1517*0Sstevel@tonic-gate if (debug & D_TIMER) { 1518*0Sstevel@tonic-gate logdebug("phyint_inst_timer(%s %s)\n", 1519*0Sstevel@tonic-gate AF_STR(pii->pii_af), pii->pii_name); 1520*0Sstevel@tonic-gate } 1521*0Sstevel@tonic-gate 1522*0Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 1523*0Sstevel@tonic-gate if (!PROBE_ENABLED(pii) && !PROBE_ENABLED(pii_other)) { 1524*0Sstevel@tonic-gate /* 1525*0Sstevel@tonic-gate * Check to see if we're here due to link up/down flapping; If 1526*0Sstevel@tonic-gate * enough time has passed, then try to bring the interface 1527*0Sstevel@tonic-gate * back up; otherwise, schedule a timer to bring it back up 1528*0Sstevel@tonic-gate * when enough time *has* elapsed. 
1529*0Sstevel@tonic-gate */ 1530*0Sstevel@tonic-gate pi = pii->pii_phyint; 1531*0Sstevel@tonic-gate if (pi->pi_state == PI_FAILED && LINK_UP(pi)) { 1532*0Sstevel@tonic-gate check_time = pi->pi_whenup[pi->pi_whendx] + MSEC_PERMIN; 1533*0Sstevel@tonic-gate if (check_time > cur_time) 1534*0Sstevel@tonic-gate return (check_time - cur_time); 1535*0Sstevel@tonic-gate 1536*0Sstevel@tonic-gate phyint_check_for_repair(pi); 1537*0Sstevel@tonic-gate } 1538*0Sstevel@tonic-gate } 1539*0Sstevel@tonic-gate 1540*0Sstevel@tonic-gate /* 1541*0Sstevel@tonic-gate * If this phyint is not yet initialized for probes, 1542*0Sstevel@tonic-gate * don't proceed further 1543*0Sstevel@tonic-gate */ 1544*0Sstevel@tonic-gate if (pii->pii_probe_sock == -1) 1545*0Sstevel@tonic-gate return (TIMER_INFINITY); 1546*0Sstevel@tonic-gate 1547*0Sstevel@tonic-gate /* 1548*0Sstevel@tonic-gate * If the timer has fired too soon, probably triggered 1549*0Sstevel@tonic-gate * by some other phyint instance, return the remaining 1550*0Sstevel@tonic-gate * time 1551*0Sstevel@tonic-gate */ 1552*0Sstevel@tonic-gate if (TIME_LT(cur_time, pii->pii_snxt_time)) 1553*0Sstevel@tonic-gate return (pii->pii_snxt_time - cur_time); 1554*0Sstevel@tonic-gate 1555*0Sstevel@tonic-gate /* 1556*0Sstevel@tonic-gate * If the link is down, don't send any probes for now. 1557*0Sstevel@tonic-gate */ 1558*0Sstevel@tonic-gate if (LINK_DOWN(pii->pii_phyint)) 1559*0Sstevel@tonic-gate return (TIMER_INFINITY); 1560*0Sstevel@tonic-gate 1561*0Sstevel@tonic-gate /* 1562*0Sstevel@tonic-gate * Randomize the next probe time, between MIN_RANDOM_FACTOR 1563*0Sstevel@tonic-gate * and MAX_RANDOM_FACTOR with respect to the base probe time. 1564*0Sstevel@tonic-gate * Base probe time is strictly periodic. 
1565*0Sstevel@tonic-gate */ 1566*0Sstevel@tonic-gate interval = GET_RANDOM( 1567*0Sstevel@tonic-gate (int)(MIN_RANDOM_FACTOR * user_probe_interval), 1568*0Sstevel@tonic-gate (int)(MAX_RANDOM_FACTOR * user_probe_interval)); 1569*0Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime + interval; 1570*0Sstevel@tonic-gate 1571*0Sstevel@tonic-gate /* 1572*0Sstevel@tonic-gate * Check if the current time > next time to probe. If so, we missed 1573*0Sstevel@tonic-gate * sending 1 or more probes, probably due to heavy system load. At least 1574*0Sstevel@tonic-gate * 'MIN_RANDOM_FACTOR * user_probe_interval' ms has elapsed since we 1575*0Sstevel@tonic-gate * were scheduled. Make adjustments to the times, in multiples of 1576*0Sstevel@tonic-gate * user_probe_interval. 1577*0Sstevel@tonic-gate */ 1578*0Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_snxt_time)) { 1579*0Sstevel@tonic-gate int n; 1580*0Sstevel@tonic-gate 1581*0Sstevel@tonic-gate n = (cur_time - pii->pii_snxt_time) / user_probe_interval; 1582*0Sstevel@tonic-gate pii->pii_snxt_time += (n + 1) * user_probe_interval; 1583*0Sstevel@tonic-gate pii->pii_snxt_basetime += (n + 1) * user_probe_interval; 1584*0Sstevel@tonic-gate logtrace("missed sending %d probes cur_time %u snxt_time %u" 1585*0Sstevel@tonic-gate " snxt_basetime %u\n", n + 1, cur_time, pii->pii_snxt_time, 1586*0Sstevel@tonic-gate pii->pii_snxt_basetime); 1587*0Sstevel@tonic-gate 1588*0Sstevel@tonic-gate /* Collect statistics about missed probes */ 1589*0Sstevel@tonic-gate probes_missed.pm_nprobes += n + 1; 1590*0Sstevel@tonic-gate probes_missed.pm_ntimes++; 1591*0Sstevel@tonic-gate } 1592*0Sstevel@tonic-gate pii->pii_snxt_basetime += user_probe_interval; 1593*0Sstevel@tonic-gate interval = pii->pii_snxt_time - cur_time; 1594*0Sstevel@tonic-gate if (debug & D_TARGET) { 1595*0Sstevel@tonic-gate logdebug("cur_time %u snxt_time %u snxt_basetime %u" 1596*0Sstevel@tonic-gate " interval %u\n", cur_time, pii->pii_snxt_time, 1597*0Sstevel@tonic-gate 
pii->pii_snxt_basetime, interval); 1598*0Sstevel@tonic-gate } 1599*0Sstevel@tonic-gate 1600*0Sstevel@tonic-gate /* 1601*0Sstevel@tonic-gate * If no targets are known, we need to send an ICMP multicast. The 1602*0Sstevel@tonic-gate * probe type is PROBE_MULTI. We'll check back in 'interval' msec 1603*0Sstevel@tonic-gate * to see if we found a target. 1604*0Sstevel@tonic-gate */ 1605*0Sstevel@tonic-gate if (pii->pii_target_next == NULL) { 1606*0Sstevel@tonic-gate assert(pii->pii_ntargets == 0); 1607*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1608*0Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 1609*0Sstevel@tonic-gate return (interval); 1610*0Sstevel@tonic-gate } 1611*0Sstevel@tonic-gate 1612*0Sstevel@tonic-gate if ((user_probe_interval != probe_interval) && 1613*0Sstevel@tonic-gate TIME_LT(pii->pii_snxt_time, pii->pii_fd_snxt_basetime)) { 1614*0Sstevel@tonic-gate /* 1615*0Sstevel@tonic-gate * the failure detection (fd) probe timer has not yet fired. 1616*0Sstevel@tonic-gate * Need to send only an rtt probe. The probe type is PROBE_RTT. 1617*0Sstevel@tonic-gate */ 1618*0Sstevel@tonic-gate probe(pii, PROBE_RTT, cur_time); 1619*0Sstevel@tonic-gate return (interval); 1620*0Sstevel@tonic-gate } 1621*0Sstevel@tonic-gate /* 1622*0Sstevel@tonic-gate * the fd probe timer has fired. Need to do all failure 1623*0Sstevel@tonic-gate * detection / recovery calculations, and then send an fd probe 1624*0Sstevel@tonic-gate * of type PROBE_UNI. 1625*0Sstevel@tonic-gate */ 1626*0Sstevel@tonic-gate if (user_probe_interval == probe_interval) { 1627*0Sstevel@tonic-gate /* 1628*0Sstevel@tonic-gate * We could have missed some probes, and then adjusted 1629*0Sstevel@tonic-gate * pii_snxt_basetime above. Otherwise we could have 1630*0Sstevel@tonic-gate * blindly added probe_interval to pii_fd_snxt_basetime. 
1631*0Sstevel@tonic-gate */ 1632*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1633*0Sstevel@tonic-gate } else { 1634*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime += probe_interval; 1635*0Sstevel@tonic-gate if (TIME_GT(cur_time, pii->pii_fd_snxt_basetime)) { 1636*0Sstevel@tonic-gate int n; 1637*0Sstevel@tonic-gate 1638*0Sstevel@tonic-gate n = (cur_time - pii->pii_fd_snxt_basetime) / 1639*0Sstevel@tonic-gate probe_interval; 1640*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime += (n + 1) * probe_interval; 1641*0Sstevel@tonic-gate } 1642*0Sstevel@tonic-gate } 1643*0Sstevel@tonic-gate 1644*0Sstevel@tonic-gate /* 1645*0Sstevel@tonic-gate * We can have at most, the latest 2 probes that we sent, in 1646*0Sstevel@tonic-gate * the PR_UNACKED state. All previous probes sent, are either 1647*0Sstevel@tonic-gate * PR_LOST or PR_ACKED. An unacknowledged probe is considered 1648*0Sstevel@tonic-gate * timed out if the probe's time_sent + the CRTT < currenttime. 1649*0Sstevel@tonic-gate * For each of the last 2 probes, examine whether it has timed 1650*0Sstevel@tonic-gate * out. If so, mark it PR_LOST. The probe stats is a circular array. 1651*0Sstevel@tonic-gate */ 1652*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1653*0Sstevel@tonic-gate valid_unack_count = 0; 1654*0Sstevel@tonic-gate 1655*0Sstevel@tonic-gate for (i = 0; i < 2; i++) { 1656*0Sstevel@tonic-gate pr_statp = &pii->pii_probes[pr_ndx]; 1657*0Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 1658*0Sstevel@tonic-gate switch (pr_statp->pr_status) { 1659*0Sstevel@tonic-gate case PR_ACKED: 1660*0Sstevel@tonic-gate /* 1661*0Sstevel@tonic-gate * We received back an ACK, so the switch clearly 1662*0Sstevel@tonic-gate * is not dropping our traffic, and thus we can 1663*0Sstevel@tonic-gate * enable failure detection immediately. 
1664*0Sstevel@tonic-gate */ 1665*0Sstevel@tonic-gate if (pii->pii_fd_hrtime > gethrtime()) { 1666*0Sstevel@tonic-gate if (debug & D_PROBE) { 1667*0Sstevel@tonic-gate logdebug("successful probe on %s; " 1668*0Sstevel@tonic-gate "ending quiet period\n", 1669*0Sstevel@tonic-gate pii->pii_phyint->pi_name); 1670*0Sstevel@tonic-gate } 1671*0Sstevel@tonic-gate pii->pii_fd_hrtime = gethrtime(); 1672*0Sstevel@tonic-gate } 1673*0Sstevel@tonic-gate break; 1674*0Sstevel@tonic-gate 1675*0Sstevel@tonic-gate case PR_UNACKED: 1676*0Sstevel@tonic-gate assert(cur_tg != NULL); 1677*0Sstevel@tonic-gate /* 1678*0Sstevel@tonic-gate * The crtt could be zero for some reason, 1679*0Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 1680*0Sstevel@tonic-gate * not available use group's probe interval, 1681*0Sstevel@tonic-gate * which is a worst case estimate. 1682*0Sstevel@tonic-gate */ 1683*0Sstevel@tonic-gate if (cur_tg->tg_crtt != 0) { 1684*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 1685*0Sstevel@tonic-gate cur_tg->tg_crtt; 1686*0Sstevel@tonic-gate } else { 1687*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 1688*0Sstevel@tonic-gate probe_interval; 1689*0Sstevel@tonic-gate } 1690*0Sstevel@tonic-gate if (TIME_LT(timeout, cur_time)) { 1691*0Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 1692*0Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 1693*0Sstevel@tonic-gate } else if (i == 1) { 1694*0Sstevel@tonic-gate /* 1695*0Sstevel@tonic-gate * We are forced to consider this probe 1696*0Sstevel@tonic-gate * lost, as we can have at most 2 unack. 1697*0Sstevel@tonic-gate * probes any time, and we will be sending a 1698*0Sstevel@tonic-gate * probe at the end of this function. 
1699*0Sstevel@tonic-gate * Normally, we should not be here, but 1700*0Sstevel@tonic-gate * this can happen if an incoming response 1701*0Sstevel@tonic-gate * that was considered lost has increased 1702*0Sstevel@tonic-gate * the crtt for this target, and also bumped 1703*0Sstevel@tonic-gate * up the FDT. Note that we never cancel or 1704*0Sstevel@tonic-gate * increase the current pii_time_left, so 1705*0Sstevel@tonic-gate * when the timer fires, we find 2 valid 1706*0Sstevel@tonic-gate * unacked probes, and they are yet to timeout 1707*0Sstevel@tonic-gate */ 1708*0Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 1709*0Sstevel@tonic-gate pr_statp->pr_time_lost = cur_time; 1710*0Sstevel@tonic-gate } else { 1711*0Sstevel@tonic-gate /* 1712*0Sstevel@tonic-gate * Only the most recent probe can enter 1713*0Sstevel@tonic-gate * this 'else' arm. The second most recent 1714*0Sstevel@tonic-gate * probe must take either of the above arms, 1715*0Sstevel@tonic-gate * if it is unacked. 1716*0Sstevel@tonic-gate */ 1717*0Sstevel@tonic-gate valid_unack_count++; 1718*0Sstevel@tonic-gate } 1719*0Sstevel@tonic-gate break; 1720*0Sstevel@tonic-gate } 1721*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pr_ndx); 1722*0Sstevel@tonic-gate } 1723*0Sstevel@tonic-gate 1724*0Sstevel@tonic-gate /* 1725*0Sstevel@tonic-gate * We send out 1 probe randomly in the interval between one half 1726*0Sstevel@tonic-gate * and one probe interval for the group. Given that the CRTT is always 1727*0Sstevel@tonic-gate * less than the group's probe interval, we can have at most 1 1728*0Sstevel@tonic-gate * unacknowledged probe now. All previous probes are either lost or 1729*0Sstevel@tonic-gate * acked. 1730*0Sstevel@tonic-gate */ 1731*0Sstevel@tonic-gate assert(valid_unack_count == 0 || valid_unack_count == 1); 1732*0Sstevel@tonic-gate 1733*0Sstevel@tonic-gate /* 1734*0Sstevel@tonic-gate * The timer has fired. Take appropriate action depending 1735*0Sstevel@tonic-gate * on the current state of the phyint. 
1736*0Sstevel@tonic-gate * 1737*0Sstevel@tonic-gate * PI_RUNNING state - Failure detection and failover 1738*0Sstevel@tonic-gate * PI_FAILED state - Repair detection and failback 1739*0Sstevel@tonic-gate */ 1740*0Sstevel@tonic-gate switch (pii->pii_phyint->pi_state) { 1741*0Sstevel@tonic-gate case PI_FAILED: 1742*0Sstevel@tonic-gate /* 1743*0Sstevel@tonic-gate * If the most recent probe (excluding unacked probes that 1744*0Sstevel@tonic-gate * are yet to time out) has been acked, check whether the 1745*0Sstevel@tonic-gate * phyint is now repaired. If the phyint is repaired, then 1746*0Sstevel@tonic-gate * attempt failback, unless it is an inactive standby. 1747*0Sstevel@tonic-gate */ 1748*0Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 == pii->pii_snxt) { 1749*0Sstevel@tonic-gate phyint_check_for_repair(pii->pii_phyint); 1750*0Sstevel@tonic-gate } 1751*0Sstevel@tonic-gate break; 1752*0Sstevel@tonic-gate 1753*0Sstevel@tonic-gate case PI_RUNNING: 1754*0Sstevel@tonic-gate /* 1755*0Sstevel@tonic-gate * It's possible our probes have been lost because of a 1756*0Sstevel@tonic-gate * spanning-tree mandated quiet period on the switch. If so, 1757*0Sstevel@tonic-gate * ignore the lost probes and consider the interface to still 1758*0Sstevel@tonic-gate * be functioning. 1759*0Sstevel@tonic-gate */ 1760*0Sstevel@tonic-gate cur_hrtime = gethrtime(); 1761*0Sstevel@tonic-gate if (pii->pii_fd_hrtime - cur_hrtime > 0) 1762*0Sstevel@tonic-gate break; 1763*0Sstevel@tonic-gate 1764*0Sstevel@tonic-gate if (pii->pii_rack + valid_unack_count + 1 != pii->pii_snxt) { 1765*0Sstevel@tonic-gate /* 1766*0Sstevel@tonic-gate * We have 1 or more failed probes (excluding unacked 1767*0Sstevel@tonic-gate * probes that are yet to time out). Determine if the 1768*0Sstevel@tonic-gate * phyint has failed. 
If so attempt a failover, 1769*0Sstevel@tonic-gate * unless it is an inactive standby 1770*0Sstevel@tonic-gate */ 1771*0Sstevel@tonic-gate phyint_inst_check_for_failure(pii); 1772*0Sstevel@tonic-gate } 1773*0Sstevel@tonic-gate break; 1774*0Sstevel@tonic-gate 1775*0Sstevel@tonic-gate default: 1776*0Sstevel@tonic-gate logerr("phyint_inst_timer: invalid state %d\n", 1777*0Sstevel@tonic-gate pii->pii_phyint->pi_state); 1778*0Sstevel@tonic-gate abort(); 1779*0Sstevel@tonic-gate } 1780*0Sstevel@tonic-gate 1781*0Sstevel@tonic-gate /* 1782*0Sstevel@tonic-gate * Start the next probe. probe() will also set pii->pii_probe_time_left 1783*0Sstevel@tonic-gate * to the group's probe interval. If phyint_failed -> target_flush_hosts 1784*0Sstevel@tonic-gate * was called, the target list may be empty. 1785*0Sstevel@tonic-gate */ 1786*0Sstevel@tonic-gate if (pii->pii_target_next != NULL) { 1787*0Sstevel@tonic-gate probe(pii, PROBE_UNI, cur_time); 1788*0Sstevel@tonic-gate /* 1789*0Sstevel@tonic-gate * If we have just the one probe target, and we're not using 1790*0Sstevel@tonic-gate * router targets, try to find another as we presently have 1791*0Sstevel@tonic-gate * no resilience. 1792*0Sstevel@tonic-gate */ 1793*0Sstevel@tonic-gate if (!pii->pii_targets_are_routers && pii->pii_ntargets == 1) 1794*0Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 1795*0Sstevel@tonic-gate } else { 1796*0Sstevel@tonic-gate probe(pii, PROBE_MULTI, cur_time); 1797*0Sstevel@tonic-gate } 1798*0Sstevel@tonic-gate return (interval); 1799*0Sstevel@tonic-gate } 1800*0Sstevel@tonic-gate 1801*0Sstevel@tonic-gate /* 1802*0Sstevel@tonic-gate * Start the probe timer for an interface instance. 
1803*0Sstevel@tonic-gate */ 1804*0Sstevel@tonic-gate void 1805*0Sstevel@tonic-gate start_timer(struct phyint_instance *pii) 1806*0Sstevel@tonic-gate { 1807*0Sstevel@tonic-gate uint32_t interval; 1808*0Sstevel@tonic-gate 1809*0Sstevel@tonic-gate /* 1810*0Sstevel@tonic-gate * Spread the base probe times (pi_snxt_basetime) across phyints 1811*0Sstevel@tonic-gate * uniformly over the (curtime..curtime + the group's probe_interval). 1812*0Sstevel@tonic-gate * pi_snxt_basetime is strictly periodic with a frequency of 1813*0Sstevel@tonic-gate * the group's probe interval. The actual probe time pi_snxt_time 1814*0Sstevel@tonic-gate * adds some randomness to pi_snxt_basetime and happens in probe(). 1815*0Sstevel@tonic-gate * For the 1st probe on each phyint after the timer is started, 1816*0Sstevel@tonic-gate * pi_snxt_time and pi_snxt_basetime are the same. 1817*0Sstevel@tonic-gate */ 1818*0Sstevel@tonic-gate interval = GET_RANDOM(0, 1819*0Sstevel@tonic-gate (int)pii->pii_phyint->pi_group->pg_probeint); 1820*0Sstevel@tonic-gate 1821*0Sstevel@tonic-gate pii->pii_snxt_basetime = getcurrenttime() + interval; 1822*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime = pii->pii_snxt_basetime; 1823*0Sstevel@tonic-gate pii->pii_snxt_time = pii->pii_snxt_basetime; 1824*0Sstevel@tonic-gate timer_schedule(interval); 1825*0Sstevel@tonic-gate } 1826*0Sstevel@tonic-gate 1827*0Sstevel@tonic-gate /* 1828*0Sstevel@tonic-gate * Restart the probe timer on an interface instance. 1829*0Sstevel@tonic-gate */ 1830*0Sstevel@tonic-gate static void 1831*0Sstevel@tonic-gate restart_timer(struct phyint_instance *pii) 1832*0Sstevel@tonic-gate { 1833*0Sstevel@tonic-gate /* 1834*0Sstevel@tonic-gate * We don't need to restart the timer if it was never started in 1835*0Sstevel@tonic-gate * the first place (pii->pii_basetime_inited not set), as the timer 1836*0Sstevel@tonic-gate * won't have gone off yet. 
1837*0Sstevel@tonic-gate */ 1838*0Sstevel@tonic-gate if (pii->pii_basetime_inited != 0) { 1839*0Sstevel@tonic-gate 1840*0Sstevel@tonic-gate if (debug & D_LINKNOTE) 1841*0Sstevel@tonic-gate logdebug("restart timer: restarting timer on %s, " 1842*0Sstevel@tonic-gate "address family %s\n", pii->pii_phyint->pi_name, 1843*0Sstevel@tonic-gate AF_STR(pii->pii_af)); 1844*0Sstevel@tonic-gate 1845*0Sstevel@tonic-gate start_timer(pii); 1846*0Sstevel@tonic-gate } 1847*0Sstevel@tonic-gate } 1848*0Sstevel@tonic-gate 1849*0Sstevel@tonic-gate static void 1850*0Sstevel@tonic-gate process_link_state_down(struct phyint *pi) 1851*0Sstevel@tonic-gate { 1852*0Sstevel@tonic-gate logerr("The link has gone down on %s\n", pi->pi_name); 1853*0Sstevel@tonic-gate 1854*0Sstevel@tonic-gate /* 1855*0Sstevel@tonic-gate * Clear the probe statistics arrays, we don't want the repair 1856*0Sstevel@tonic-gate * detection logic relying on probes that were succesful prior 1857*0Sstevel@tonic-gate * to the link going down. 1858*0Sstevel@tonic-gate */ 1859*0Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v4)) 1860*0Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v4); 1861*0Sstevel@tonic-gate if (PROBE_CAPABLE(pi->pi_v6)) 1862*0Sstevel@tonic-gate clear_pii_probe_stats(pi->pi_v6); 1863*0Sstevel@tonic-gate /* 1864*0Sstevel@tonic-gate * Check for interface failure. Although we know the interface 1865*0Sstevel@tonic-gate * has failed, we don't know if all the other interfaces in the 1866*0Sstevel@tonic-gate * group have failed as well. 
1867*0Sstevel@tonic-gate */ 1868*0Sstevel@tonic-gate if ((pi->pi_state == PI_RUNNING) || 1869*0Sstevel@tonic-gate (pi->pi_state != PI_FAILED && !GROUP_FAILED(pi->pi_group))) { 1870*0Sstevel@tonic-gate if (debug & D_LINKNOTE) { 1871*0Sstevel@tonic-gate logdebug("process_link_state_down:" 1872*0Sstevel@tonic-gate " checking for failure on %s\n", pi->pi_name); 1873*0Sstevel@tonic-gate } 1874*0Sstevel@tonic-gate 1875*0Sstevel@tonic-gate if (pi->pi_v4 != NULL) 1876*0Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v4); 1877*0Sstevel@tonic-gate else if (pi->pi_v6 != NULL) 1878*0Sstevel@tonic-gate phyint_inst_check_for_failure(pi->pi_v6); 1879*0Sstevel@tonic-gate } 1880*0Sstevel@tonic-gate } 1881*0Sstevel@tonic-gate 1882*0Sstevel@tonic-gate static void 1883*0Sstevel@tonic-gate process_link_state_up(struct phyint *pi) 1884*0Sstevel@tonic-gate { 1885*0Sstevel@tonic-gate logerr("The link has come up on %s\n", pi->pi_name); 1886*0Sstevel@tonic-gate 1887*0Sstevel@tonic-gate /* 1888*0Sstevel@tonic-gate * We stopped any running timers on each instance when the link 1889*0Sstevel@tonic-gate * went down, so restart them. 1890*0Sstevel@tonic-gate */ 1891*0Sstevel@tonic-gate if (pi->pi_v4) 1892*0Sstevel@tonic-gate restart_timer(pi->pi_v4); 1893*0Sstevel@tonic-gate if (pi->pi_v6) 1894*0Sstevel@tonic-gate restart_timer(pi->pi_v6); 1895*0Sstevel@tonic-gate 1896*0Sstevel@tonic-gate phyint_check_for_repair(pi); 1897*0Sstevel@tonic-gate 1898*0Sstevel@tonic-gate pi->pi_whenup[pi->pi_whendx++] = getcurrenttime(); 1899*0Sstevel@tonic-gate if (pi->pi_whendx == LINK_UP_PERMIN) 1900*0Sstevel@tonic-gate pi->pi_whendx = 0; 1901*0Sstevel@tonic-gate } 1902*0Sstevel@tonic-gate 1903*0Sstevel@tonic-gate /* 1904*0Sstevel@tonic-gate * Process any changes in link state passed up from the interfaces. 
1905*0Sstevel@tonic-gate */ 1906*0Sstevel@tonic-gate void 1907*0Sstevel@tonic-gate process_link_state_changes(void) 1908*0Sstevel@tonic-gate { 1909*0Sstevel@tonic-gate struct phyint *pi; 1910*0Sstevel@tonic-gate 1911*0Sstevel@tonic-gate /* Look for interfaces where the link state has just changed */ 1912*0Sstevel@tonic-gate 1913*0Sstevel@tonic-gate for (pi = phyints; pi != NULL; pi = pi->pi_next) { 1914*0Sstevel@tonic-gate boolean_t old_link_state_up = LINK_UP(pi); 1915*0Sstevel@tonic-gate 1916*0Sstevel@tonic-gate /* 1917*0Sstevel@tonic-gate * Except when the "phyint" structure is created, this is 1918*0Sstevel@tonic-gate * the only place the link state is updated. This allows 1919*0Sstevel@tonic-gate * this routine to detect changes in link state, rather 1920*0Sstevel@tonic-gate * than just the current state. 1921*0Sstevel@tonic-gate */ 1922*0Sstevel@tonic-gate UPDATE_LINK_STATE(pi); 1923*0Sstevel@tonic-gate 1924*0Sstevel@tonic-gate if (LINK_DOWN(pi)) { 1925*0Sstevel@tonic-gate /* 1926*0Sstevel@tonic-gate * Has link just gone down? 1927*0Sstevel@tonic-gate */ 1928*0Sstevel@tonic-gate if (old_link_state_up) 1929*0Sstevel@tonic-gate process_link_state_down(pi); 1930*0Sstevel@tonic-gate } else { 1931*0Sstevel@tonic-gate /* 1932*0Sstevel@tonic-gate * Has link just gone back up? 
1933*0Sstevel@tonic-gate */ 1934*0Sstevel@tonic-gate if (!old_link_state_up) 1935*0Sstevel@tonic-gate process_link_state_up(pi); 1936*0Sstevel@tonic-gate } 1937*0Sstevel@tonic-gate } 1938*0Sstevel@tonic-gate } 1939*0Sstevel@tonic-gate 1940*0Sstevel@tonic-gate void 1941*0Sstevel@tonic-gate reset_crtt_all(struct phyint *pi) 1942*0Sstevel@tonic-gate { 1943*0Sstevel@tonic-gate struct phyint_instance *pii; 1944*0Sstevel@tonic-gate struct target *tg; 1945*0Sstevel@tonic-gate 1946*0Sstevel@tonic-gate pii = pi->pi_v4; 1947*0Sstevel@tonic-gate if (pii != NULL) { 1948*0Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1949*0Sstevel@tonic-gate tg->tg_crtt = 0; 1950*0Sstevel@tonic-gate tg->tg_rtt_sa = -1; 1951*0Sstevel@tonic-gate tg->tg_rtt_sd = 0; 1952*0Sstevel@tonic-gate } 1953*0Sstevel@tonic-gate } 1954*0Sstevel@tonic-gate 1955*0Sstevel@tonic-gate pii = pi->pi_v6; 1956*0Sstevel@tonic-gate if (pii != NULL) { 1957*0Sstevel@tonic-gate for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1958*0Sstevel@tonic-gate tg->tg_crtt = 0; 1959*0Sstevel@tonic-gate tg->tg_rtt_sa = -1; 1960*0Sstevel@tonic-gate tg->tg_rtt_sd = 0; 1961*0Sstevel@tonic-gate } 1962*0Sstevel@tonic-gate } 1963*0Sstevel@tonic-gate } 1964*0Sstevel@tonic-gate 1965*0Sstevel@tonic-gate /* 1966*0Sstevel@tonic-gate * Check if the phyint has failed the last NUM_PROBE_FAILS consecutive 1967*0Sstevel@tonic-gate * probes on both instances IPv4 and IPv6. 1968*0Sstevel@tonic-gate * If the interface has failed, return the time of the first probe failure 1969*0Sstevel@tonic-gate * in "tff". 
1970*0Sstevel@tonic-gate */ 1971*0Sstevel@tonic-gate static int 1972*0Sstevel@tonic-gate phyint_inst_probe_failure_state(struct phyint_instance *pii, uint_t *tff) 1973*0Sstevel@tonic-gate { 1974*0Sstevel@tonic-gate uint_t pi_tff; 1975*0Sstevel@tonic-gate struct target *cur_tg; 1976*0Sstevel@tonic-gate struct probe_fail_count pfinfo; 1977*0Sstevel@tonic-gate struct phyint_instance *pii_other; 1978*0Sstevel@tonic-gate int pr_ndx; 1979*0Sstevel@tonic-gate 1980*0Sstevel@tonic-gate /* 1981*0Sstevel@tonic-gate * Get the number of consecutive failed probes on 1982*0Sstevel@tonic-gate * this phyint across all targets. Also get the number 1983*0Sstevel@tonic-gate * of consecutive failed probes on this target only 1984*0Sstevel@tonic-gate */ 1985*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 1986*0Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 1987*0Sstevel@tonic-gate probe_fail_info(pii, cur_tg, &pfinfo); 1988*0Sstevel@tonic-gate 1989*0Sstevel@tonic-gate /* Get the time of first failure, for later use */ 1990*0Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 1991*0Sstevel@tonic-gate 1992*0Sstevel@tonic-gate /* 1993*0Sstevel@tonic-gate * If the current target has not responded to the 1994*0Sstevel@tonic-gate * last NUM_PROBE_FAILS probes, and other targets are 1995*0Sstevel@tonic-gate * responding delete this target. Dead gateway detection 1996*0Sstevel@tonic-gate * will eventually remove this target (if router) from the 1997*0Sstevel@tonic-gate * routing tables. If that does not occur, we may end 1998*0Sstevel@tonic-gate * up adding this to our list again. 
1999*0Sstevel@tonic-gate */ 2000*0Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS && 2001*0Sstevel@tonic-gate pfinfo.pf_nfail_tg >= NUM_PROBE_FAILS) { 2002*0Sstevel@tonic-gate if (pii->pii_targets_are_routers) { 2003*0Sstevel@tonic-gate if (cur_tg->tg_status == TG_ACTIVE) 2004*0Sstevel@tonic-gate pii->pii_ntargets--; 2005*0Sstevel@tonic-gate cur_tg->tg_status = TG_DEAD; 2006*0Sstevel@tonic-gate cur_tg->tg_crtt = 0; 2007*0Sstevel@tonic-gate cur_tg->tg_rtt_sa = -1; 2008*0Sstevel@tonic-gate cur_tg->tg_rtt_sd = 0; 2009*0Sstevel@tonic-gate if (pii->pii_target_next == cur_tg) 2010*0Sstevel@tonic-gate pii->pii_target_next = target_next(cur_tg); 2011*0Sstevel@tonic-gate } else { 2012*0Sstevel@tonic-gate target_delete(cur_tg); 2013*0Sstevel@tonic-gate probe(pii, PROBE_MULTI, getcurrenttime()); 2014*0Sstevel@tonic-gate } 2015*0Sstevel@tonic-gate return (PHYINT_OK); 2016*0Sstevel@tonic-gate } 2017*0Sstevel@tonic-gate 2018*0Sstevel@tonic-gate /* 2019*0Sstevel@tonic-gate * If the phyint has lost NUM_PROBE_FAILS or more 2020*0Sstevel@tonic-gate * consecutive probes, on both IPv4 and IPv6 protocol 2021*0Sstevel@tonic-gate * instances of the phyint, then trigger failure 2022*0Sstevel@tonic-gate * detection, else return false 2023*0Sstevel@tonic-gate */ 2024*0Sstevel@tonic-gate if (pfinfo.pf_nfail < NUM_PROBE_FAILS) 2025*0Sstevel@tonic-gate return (PHYINT_OK); 2026*0Sstevel@tonic-gate 2027*0Sstevel@tonic-gate pii_other = phyint_inst_other(pii); 2028*0Sstevel@tonic-gate if (PROBE_CAPABLE(pii_other)) { 2029*0Sstevel@tonic-gate probe_fail_info(pii_other, NULL, &pfinfo); 2030*0Sstevel@tonic-gate if (pfinfo.pf_nfail >= NUM_PROBE_FAILS) { 2031*0Sstevel@tonic-gate /* 2032*0Sstevel@tonic-gate * We have NUM_PROBE_FAILS or more failures 2033*0Sstevel@tonic-gate * on both IPv4 and IPv6. Get the earliest 2034*0Sstevel@tonic-gate * time when failure was detected on this 2035*0Sstevel@tonic-gate * phyint across IPv4 and IPv6. 
2036*0Sstevel@tonic-gate */ 2037*0Sstevel@tonic-gate if (TIME_LT(pfinfo.pf_tff, pi_tff)) 2038*0Sstevel@tonic-gate pi_tff = pfinfo.pf_tff; 2039*0Sstevel@tonic-gate } else { 2040*0Sstevel@tonic-gate /* 2041*0Sstevel@tonic-gate * This instance has < NUM_PROBE_FAILS failure. 2042*0Sstevel@tonic-gate * So return false 2043*0Sstevel@tonic-gate */ 2044*0Sstevel@tonic-gate return (PHYINT_OK); 2045*0Sstevel@tonic-gate } 2046*0Sstevel@tonic-gate } 2047*0Sstevel@tonic-gate *tff = pi_tff; 2048*0Sstevel@tonic-gate return (PHYINT_FAILURE); 2049*0Sstevel@tonic-gate } 2050*0Sstevel@tonic-gate 2051*0Sstevel@tonic-gate /* 2052*0Sstevel@tonic-gate * Check if the link has gone down on this phyint, or it has failed the 2053*0Sstevel@tonic-gate * last NUM_PROBE_FAILS consecutive probes on both instances IPv4 and IPv6. 2054*0Sstevel@tonic-gate * Also look at other phyints of this group, for group failures. 2055*0Sstevel@tonic-gate */ 2056*0Sstevel@tonic-gate int 2057*0Sstevel@tonic-gate failure_state(struct phyint_instance *pii) 2058*0Sstevel@tonic-gate { 2059*0Sstevel@tonic-gate struct probe_success_count psinfo; 2060*0Sstevel@tonic-gate uint_t pi2_tls; /* time last success */ 2061*0Sstevel@tonic-gate uint_t pi_tff; /* time first fail */ 2062*0Sstevel@tonic-gate struct phyint *pi2; 2063*0Sstevel@tonic-gate struct phyint *pi; 2064*0Sstevel@tonic-gate struct phyint_instance *pii2; 2065*0Sstevel@tonic-gate struct phyint_group *pg; 2066*0Sstevel@tonic-gate boolean_t alone; 2067*0Sstevel@tonic-gate 2068*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2069*0Sstevel@tonic-gate logdebug("phyint_failed(%s)\n", pii->pii_name); 2070*0Sstevel@tonic-gate 2071*0Sstevel@tonic-gate pi = pii->pii_phyint; 2072*0Sstevel@tonic-gate pg = pi->pi_group; 2073*0Sstevel@tonic-gate 2074*0Sstevel@tonic-gate if (LINK_UP(pi) && phyint_inst_probe_failure_state(pii, &pi_tff) == 2075*0Sstevel@tonic-gate PHYINT_OK) 2076*0Sstevel@tonic-gate return (PHYINT_OK); 2077*0Sstevel@tonic-gate 2078*0Sstevel@tonic-gate /* 
2079*0Sstevel@tonic-gate * At this point, the link is down, or the phyint is suspect, 2080*0Sstevel@tonic-gate * as it has lost NUM_PROBE_FAILS or more probes. If the phyint 2081*0Sstevel@tonic-gate * does not belong to any group, or is the only member of the 2082*0Sstevel@tonic-gate * group capable of being probed, return PHYINT_FAILURE. 2083*0Sstevel@tonic-gate */ 2084*0Sstevel@tonic-gate alone = _B_TRUE; 2085*0Sstevel@tonic-gate if (pg != phyint_anongroup) { 2086*0Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2087*0Sstevel@tonic-gate if (pi2 == pi) 2088*0Sstevel@tonic-gate continue; 2089*0Sstevel@tonic-gate if (PROBE_CAPABLE(pi2->pi_v4) || 2090*0Sstevel@tonic-gate PROBE_CAPABLE(pi2->pi_v6)) { 2091*0Sstevel@tonic-gate alone = _B_FALSE; 2092*0Sstevel@tonic-gate break; 2093*0Sstevel@tonic-gate } 2094*0Sstevel@tonic-gate } 2095*0Sstevel@tonic-gate } 2096*0Sstevel@tonic-gate if (alone) 2097*0Sstevel@tonic-gate return (PHYINT_FAILURE); 2098*0Sstevel@tonic-gate 2099*0Sstevel@tonic-gate /* 2100*0Sstevel@tonic-gate * Need to compare against other phyints of the same group 2101*0Sstevel@tonic-gate * to exclude group failures. If the failure was detected via 2102*0Sstevel@tonic-gate * probing, then if the time of last success (tls) of any 2103*0Sstevel@tonic-gate * phyint is more recent than the time of first fail (tff) of the 2104*0Sstevel@tonic-gate * phyint in question, and the link is up on the phyint, 2105*0Sstevel@tonic-gate * then it is a phyint failure. Otherwise it is a group failure. 2106*0Sstevel@tonic-gate * If failure was detected via a link down notification sent from 2107*0Sstevel@tonic-gate * the driver to IP, we see if any phyints in the group are still 2108*0Sstevel@tonic-gate * running and haven't received a link down notification. 
We 2109*0Sstevel@tonic-gate * will usually be processing the link down notification shortly 2110*0Sstevel@tonic-gate * after it was received, so there is no point looking at the tls 2111*0Sstevel@tonic-gate * of other phyints. 2112*0Sstevel@tonic-gate */ 2113*0Sstevel@tonic-gate for (pi2 = pg->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2114*0Sstevel@tonic-gate /* Exclude ourself from comparison */ 2115*0Sstevel@tonic-gate if (pi2 == pi) 2116*0Sstevel@tonic-gate continue; 2117*0Sstevel@tonic-gate 2118*0Sstevel@tonic-gate if (LINK_DOWN(pi)) { 2119*0Sstevel@tonic-gate /* 2120*0Sstevel@tonic-gate * We use FLAGS_TO_LINK_STATE() to test the 2121*0Sstevel@tonic-gate * flags directly, rather then LINK_UP() or 2122*0Sstevel@tonic-gate * LINK_DOWN(), as we may not have got round 2123*0Sstevel@tonic-gate * to processing the link state for the other 2124*0Sstevel@tonic-gate * phyints in the group yet. 2125*0Sstevel@tonic-gate * 2126*0Sstevel@tonic-gate * The check for PI_RUNNING and group 2127*0Sstevel@tonic-gate * failure handles the case when the 2128*0Sstevel@tonic-gate * group begins to recover. The first 2129*0Sstevel@tonic-gate * phyint to recover should not trigger 2130*0Sstevel@tonic-gate * a failover from the soon-to-recover 2131*0Sstevel@tonic-gate * other phyints to the first recovered 2132*0Sstevel@tonic-gate * phyint. PI_RUNNING will be set, and 2133*0Sstevel@tonic-gate * pg_groupfailed cleared only after 2134*0Sstevel@tonic-gate * receipt of NUM_PROBE_REPAIRS, by 2135*0Sstevel@tonic-gate * which time the other phyints should 2136*0Sstevel@tonic-gate * have received at least 1 packet, 2137*0Sstevel@tonic-gate * and so will not have NUM_PROBE_FAILS. 
2138*0Sstevel@tonic-gate */ 2139*0Sstevel@tonic-gate if ((pi2->pi_state == PI_RUNNING) && 2140*0Sstevel@tonic-gate !GROUP_FAILED(pg) && FLAGS_TO_LINK_STATE(pi2)) 2141*0Sstevel@tonic-gate return (PHYINT_FAILURE); 2142*0Sstevel@tonic-gate } else { 2143*0Sstevel@tonic-gate /* 2144*0Sstevel@tonic-gate * Need to compare against both IPv4 and 2145*0Sstevel@tonic-gate * IPv6 instances. 2146*0Sstevel@tonic-gate */ 2147*0Sstevel@tonic-gate pii2 = pi2->pi_v4; 2148*0Sstevel@tonic-gate if (pii2 != NULL) { 2149*0Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 2150*0Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 2151*0Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 2152*0Sstevel@tonic-gate /* 2153*0Sstevel@tonic-gate * See comment above regarding check 2154*0Sstevel@tonic-gate * for PI_RUNNING and group failure. 2155*0Sstevel@tonic-gate */ 2156*0Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 2157*0Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 2158*0Sstevel@tonic-gate !GROUP_FAILED(pg) && 2159*0Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 2160*0Sstevel@tonic-gate return (PHYINT_FAILURE); 2161*0Sstevel@tonic-gate } 2162*0Sstevel@tonic-gate } 2163*0Sstevel@tonic-gate 2164*0Sstevel@tonic-gate pii2 = pi2->pi_v6; 2165*0Sstevel@tonic-gate if (pii2 != NULL) { 2166*0Sstevel@tonic-gate probe_success_info(pii2, NULL, &psinfo); 2167*0Sstevel@tonic-gate if (psinfo.ps_tls_valid) { 2168*0Sstevel@tonic-gate pi2_tls = psinfo.ps_tls; 2169*0Sstevel@tonic-gate /* 2170*0Sstevel@tonic-gate * See comment above regarding check 2171*0Sstevel@tonic-gate * for PI_RUNNING and group failure. 
2172*0Sstevel@tonic-gate */ 2173*0Sstevel@tonic-gate if (TIME_GT(pi2_tls, pi_tff) && 2174*0Sstevel@tonic-gate (pi2->pi_state == PI_RUNNING) && 2175*0Sstevel@tonic-gate !GROUP_FAILED(pg) && 2176*0Sstevel@tonic-gate FLAGS_TO_LINK_STATE(pi2)) 2177*0Sstevel@tonic-gate return (PHYINT_FAILURE); 2178*0Sstevel@tonic-gate } 2179*0Sstevel@tonic-gate } 2180*0Sstevel@tonic-gate } 2181*0Sstevel@tonic-gate } 2182*0Sstevel@tonic-gate 2183*0Sstevel@tonic-gate /* 2184*0Sstevel@tonic-gate * Change the group state to PG_FAILED if it's not already. 2185*0Sstevel@tonic-gate */ 2186*0Sstevel@tonic-gate if (!GROUP_FAILED(pg)) 2187*0Sstevel@tonic-gate phyint_group_chstate(pg, PG_FAILED); 2188*0Sstevel@tonic-gate 2189*0Sstevel@tonic-gate return (GROUP_FAILURE); 2190*0Sstevel@tonic-gate } 2191*0Sstevel@tonic-gate 2192*0Sstevel@tonic-gate /* 2193*0Sstevel@tonic-gate * Return the information associated with consecutive probe successes 2194*0Sstevel@tonic-gate * starting with the most recent probe. At most the last 2 probes can be 2195*0Sstevel@tonic-gate * in the unacknowledged state. All previous probes have either failed 2196*0Sstevel@tonic-gate * or succeeded. 
2197*0Sstevel@tonic-gate */ 2198*0Sstevel@tonic-gate static void 2199*0Sstevel@tonic-gate probe_success_info(struct phyint_instance *pii, struct target *cur_tg, 2200*0Sstevel@tonic-gate struct probe_success_count *psinfo) 2201*0Sstevel@tonic-gate { 2202*0Sstevel@tonic-gate uint_t i; 2203*0Sstevel@tonic-gate struct probe_stats *pr_statp; 2204*0Sstevel@tonic-gate uint_t most_recent; 2205*0Sstevel@tonic-gate uint_t second_most_recent; 2206*0Sstevel@tonic-gate boolean_t pi_found_failure = _B_FALSE; 2207*0Sstevel@tonic-gate boolean_t tg_found_failure = _B_FALSE; 2208*0Sstevel@tonic-gate uint_t now; 2209*0Sstevel@tonic-gate uint_t timeout; 2210*0Sstevel@tonic-gate struct target *tg; 2211*0Sstevel@tonic-gate 2212*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2213*0Sstevel@tonic-gate logdebug("probe_success_info(%s)\n", pii->pii_name); 2214*0Sstevel@tonic-gate 2215*0Sstevel@tonic-gate bzero(psinfo, sizeof (*psinfo)); 2216*0Sstevel@tonic-gate now = getcurrenttime(); 2217*0Sstevel@tonic-gate 2218*0Sstevel@tonic-gate /* 2219*0Sstevel@tonic-gate * Start with the most recent probe, and count the number 2220*0Sstevel@tonic-gate * of consecutive probe successes. Latch the number of successes 2221*0Sstevel@tonic-gate * on hitting a failure. 
2222*0Sstevel@tonic-gate */ 2223*0Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2224*0Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 2225*0Sstevel@tonic-gate 2226*0Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 2227*0Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 2228*0Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 2229*0Sstevel@tonic-gate 2230*0Sstevel@tonic-gate switch (pr_statp->pr_status) { 2231*0Sstevel@tonic-gate case PR_UNACKED: 2232*0Sstevel@tonic-gate /* 2233*0Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 2234*0Sstevel@tonic-gate */ 2235*0Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 2236*0Sstevel@tonic-gate 2237*0Sstevel@tonic-gate tg = pr_statp->pr_target; 2238*0Sstevel@tonic-gate assert(tg != NULL); 2239*0Sstevel@tonic-gate /* 2240*0Sstevel@tonic-gate * The crtt could be zero for some reason, 2241*0Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 2242*0Sstevel@tonic-gate * not available use the value of the group's probe 2243*0Sstevel@tonic-gate * interval which is a worst case estimate. 2244*0Sstevel@tonic-gate */ 2245*0Sstevel@tonic-gate if (tg->tg_crtt != 0) { 2246*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2247*0Sstevel@tonic-gate } else { 2248*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 2249*0Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 2250*0Sstevel@tonic-gate } 2251*0Sstevel@tonic-gate 2252*0Sstevel@tonic-gate if (TIME_LT(timeout, now)) { 2253*0Sstevel@tonic-gate /* 2254*0Sstevel@tonic-gate * We hit a failure. Latch the total number of 2255*0Sstevel@tonic-gate * recent consecutive successes. 
2256*0Sstevel@tonic-gate */ 2257*0Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 2258*0Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 2259*0Sstevel@tonic-gate pi_found_failure = _B_TRUE; 2260*0Sstevel@tonic-gate if (cur_tg != NULL && tg == cur_tg) { 2261*0Sstevel@tonic-gate /* 2262*0Sstevel@tonic-gate * We hit a failure for the desired 2263*0Sstevel@tonic-gate * target. Latch the number of recent 2264*0Sstevel@tonic-gate * consecutive successes for this target 2265*0Sstevel@tonic-gate */ 2266*0Sstevel@tonic-gate tg_found_failure = _B_TRUE; 2267*0Sstevel@tonic-gate } 2268*0Sstevel@tonic-gate } 2269*0Sstevel@tonic-gate break; 2270*0Sstevel@tonic-gate 2271*0Sstevel@tonic-gate case PR_ACKED: 2272*0Sstevel@tonic-gate /* 2273*0Sstevel@tonic-gate * Bump up the count of probe successes, if we 2274*0Sstevel@tonic-gate * have not seen any failure so far. 2275*0Sstevel@tonic-gate */ 2276*0Sstevel@tonic-gate if (!pi_found_failure) 2277*0Sstevel@tonic-gate psinfo->ps_nsucc++; 2278*0Sstevel@tonic-gate 2279*0Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2280*0Sstevel@tonic-gate !tg_found_failure) { 2281*0Sstevel@tonic-gate psinfo->ps_nsucc_tg++; 2282*0Sstevel@tonic-gate } 2283*0Sstevel@tonic-gate 2284*0Sstevel@tonic-gate /* 2285*0Sstevel@tonic-gate * Record the time of last success, if this is 2286*0Sstevel@tonic-gate * the most recent probe success. 2287*0Sstevel@tonic-gate */ 2288*0Sstevel@tonic-gate if (!psinfo->ps_tls_valid) { 2289*0Sstevel@tonic-gate psinfo->ps_tls = pr_statp->pr_time_acked; 2290*0Sstevel@tonic-gate psinfo->ps_tls_valid = _B_TRUE; 2291*0Sstevel@tonic-gate } 2292*0Sstevel@tonic-gate break; 2293*0Sstevel@tonic-gate 2294*0Sstevel@tonic-gate case PR_LOST: 2295*0Sstevel@tonic-gate /* 2296*0Sstevel@tonic-gate * We hit a failure. Latch the total number of 2297*0Sstevel@tonic-gate * recent consecutive successes. 
2298*0Sstevel@tonic-gate */ 2299*0Sstevel@tonic-gate pi_found_failure = _B_TRUE; 2300*0Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2301*0Sstevel@tonic-gate /* 2302*0Sstevel@tonic-gate * We hit a failure for the desired target. 2303*0Sstevel@tonic-gate * Latch the number of recent consecutive 2304*0Sstevel@tonic-gate * successes for this target 2305*0Sstevel@tonic-gate */ 2306*0Sstevel@tonic-gate tg_found_failure = _B_TRUE; 2307*0Sstevel@tonic-gate } 2308*0Sstevel@tonic-gate break; 2309*0Sstevel@tonic-gate 2310*0Sstevel@tonic-gate default: 2311*0Sstevel@tonic-gate return; 2312*0Sstevel@tonic-gate 2313*0Sstevel@tonic-gate } 2314*0Sstevel@tonic-gate } 2315*0Sstevel@tonic-gate } 2316*0Sstevel@tonic-gate 2317*0Sstevel@tonic-gate /* 2318*0Sstevel@tonic-gate * Return the information associated with consecutive probe failures 2319*0Sstevel@tonic-gate * starting with the most recent probe. Only the last 2 probes can be in the 2320*0Sstevel@tonic-gate * unacknowledged state. All previous probes have either failed or succeeded. 
2321*0Sstevel@tonic-gate */ 2322*0Sstevel@tonic-gate static void 2323*0Sstevel@tonic-gate probe_fail_info(struct phyint_instance *pii, struct target *cur_tg, 2324*0Sstevel@tonic-gate struct probe_fail_count *pfinfo) 2325*0Sstevel@tonic-gate { 2326*0Sstevel@tonic-gate int i; 2327*0Sstevel@tonic-gate struct probe_stats *pr_statp; 2328*0Sstevel@tonic-gate boolean_t tg_found_success = _B_FALSE; 2329*0Sstevel@tonic-gate boolean_t pi_found_success = _B_FALSE; 2330*0Sstevel@tonic-gate int most_recent; 2331*0Sstevel@tonic-gate int second_most_recent; 2332*0Sstevel@tonic-gate uint_t now; 2333*0Sstevel@tonic-gate uint_t timeout; 2334*0Sstevel@tonic-gate struct target *tg; 2335*0Sstevel@tonic-gate 2336*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2337*0Sstevel@tonic-gate logdebug("probe_fail_info(%s)\n", pii->pii_name); 2338*0Sstevel@tonic-gate 2339*0Sstevel@tonic-gate bzero(pfinfo, sizeof (*pfinfo)); 2340*0Sstevel@tonic-gate now = getcurrenttime(); 2341*0Sstevel@tonic-gate 2342*0Sstevel@tonic-gate /* 2343*0Sstevel@tonic-gate * Start with the most recent probe, and count the number 2344*0Sstevel@tonic-gate * of consecutive probe failures. Latch the number of failures 2345*0Sstevel@tonic-gate * on hitting a probe success. 
2346*0Sstevel@tonic-gate */ 2347*0Sstevel@tonic-gate most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 2348*0Sstevel@tonic-gate second_most_recent = PROBE_INDEX_PREV(most_recent); 2349*0Sstevel@tonic-gate 2350*0Sstevel@tonic-gate for (i = most_recent; i != pii->pii_probe_next; 2351*0Sstevel@tonic-gate i = PROBE_INDEX_PREV(i)) { 2352*0Sstevel@tonic-gate pr_statp = &pii->pii_probes[i]; 2353*0Sstevel@tonic-gate 2354*0Sstevel@tonic-gate assert(PR_STATUS_VALID(pr_statp->pr_status)); 2355*0Sstevel@tonic-gate 2356*0Sstevel@tonic-gate switch (pr_statp->pr_status) { 2357*0Sstevel@tonic-gate case PR_UNACKED: 2358*0Sstevel@tonic-gate /* 2359*0Sstevel@tonic-gate * Only the most recent 2 probes can be unacknowledged 2360*0Sstevel@tonic-gate */ 2361*0Sstevel@tonic-gate assert(i == most_recent || i == second_most_recent); 2362*0Sstevel@tonic-gate 2363*0Sstevel@tonic-gate tg = pr_statp->pr_target; 2364*0Sstevel@tonic-gate /* 2365*0Sstevel@tonic-gate * Target is guaranteed to exist in the unack. state 2366*0Sstevel@tonic-gate */ 2367*0Sstevel@tonic-gate assert(tg != NULL); 2368*0Sstevel@tonic-gate /* 2369*0Sstevel@tonic-gate * The crtt could be zero for some reason, 2370*0Sstevel@tonic-gate * Eg. the phyint could be failed. If the crtt is 2371*0Sstevel@tonic-gate * not available use the group's probe interval, 2372*0Sstevel@tonic-gate * which is a worst case estimate. 
2373*0Sstevel@tonic-gate */ 2374*0Sstevel@tonic-gate if (tg->tg_crtt != 0) { 2375*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + tg->tg_crtt; 2376*0Sstevel@tonic-gate } else { 2377*0Sstevel@tonic-gate timeout = pr_statp->pr_time_sent + 2378*0Sstevel@tonic-gate pii->pii_phyint->pi_group->pg_probeint; 2379*0Sstevel@tonic-gate } 2380*0Sstevel@tonic-gate 2381*0Sstevel@tonic-gate if (TIME_GT(timeout, now)) 2382*0Sstevel@tonic-gate break; 2383*0Sstevel@tonic-gate 2384*0Sstevel@tonic-gate pr_statp->pr_time_lost = timeout; 2385*0Sstevel@tonic-gate pr_statp->pr_status = PR_LOST; 2386*0Sstevel@tonic-gate /* FALLTHRU */ 2387*0Sstevel@tonic-gate 2388*0Sstevel@tonic-gate case PR_LOST: 2389*0Sstevel@tonic-gate if (!pi_found_success) { 2390*0Sstevel@tonic-gate pfinfo->pf_nfail++; 2391*0Sstevel@tonic-gate pfinfo->pf_tff = pr_statp->pr_time_lost; 2392*0Sstevel@tonic-gate } 2393*0Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg && 2394*0Sstevel@tonic-gate !tg_found_success) { 2395*0Sstevel@tonic-gate pfinfo->pf_nfail_tg++; 2396*0Sstevel@tonic-gate } 2397*0Sstevel@tonic-gate break; 2398*0Sstevel@tonic-gate 2399*0Sstevel@tonic-gate default: 2400*0Sstevel@tonic-gate /* 2401*0Sstevel@tonic-gate * We hit a success or unused slot. Latch the 2402*0Sstevel@tonic-gate * total number of recent consecutive failures. 2403*0Sstevel@tonic-gate */ 2404*0Sstevel@tonic-gate pi_found_success = _B_TRUE; 2405*0Sstevel@tonic-gate if (cur_tg != NULL && pr_statp->pr_target == cur_tg) { 2406*0Sstevel@tonic-gate /* 2407*0Sstevel@tonic-gate * We hit a success for the desired target. 
2408*0Sstevel@tonic-gate * Latch the number of recent consecutive 2409*0Sstevel@tonic-gate * failures for this target 2410*0Sstevel@tonic-gate */ 2411*0Sstevel@tonic-gate tg_found_success = _B_TRUE; 2412*0Sstevel@tonic-gate } 2413*0Sstevel@tonic-gate } 2414*0Sstevel@tonic-gate } 2415*0Sstevel@tonic-gate } 2416*0Sstevel@tonic-gate 2417*0Sstevel@tonic-gate /* 2418*0Sstevel@tonic-gate * Check if the phyint has been repaired. If no test address has been 2419*0Sstevel@tonic-gate * configured, then consider the interface repaired if the link is up (unless 2420*0Sstevel@tonic-gate * the link is flapping; see below). Otherwise, look for proof of probes 2421*0Sstevel@tonic-gate * being sent and received. If last NUM_PROBE_REPAIRS probes are fine on 2422*0Sstevel@tonic-gate * either IPv4 or IPv6 instance, the phyint can be considered repaired. 2423*0Sstevel@tonic-gate */ 2424*0Sstevel@tonic-gate static boolean_t 2425*0Sstevel@tonic-gate phyint_repaired(struct phyint *pi) 2426*0Sstevel@tonic-gate { 2427*0Sstevel@tonic-gate struct probe_success_count psinfo; 2428*0Sstevel@tonic-gate struct phyint_instance *pii; 2429*0Sstevel@tonic-gate struct target *cur_tg; 2430*0Sstevel@tonic-gate int pr_ndx; 2431*0Sstevel@tonic-gate uint_t cur_time; 2432*0Sstevel@tonic-gate 2433*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2434*0Sstevel@tonic-gate logdebug("phyint_repaired(%s)\n", pi->pi_name); 2435*0Sstevel@tonic-gate 2436*0Sstevel@tonic-gate if (LINK_DOWN(pi)) 2437*0Sstevel@tonic-gate return (_B_FALSE); 2438*0Sstevel@tonic-gate 2439*0Sstevel@tonic-gate /* 2440*0Sstevel@tonic-gate * If we don't have any test addresses and the link is up, then 2441*0Sstevel@tonic-gate * consider the interface repaired, unless we've received more than 2442*0Sstevel@tonic-gate * LINK_UP_PERMIN link up notifications in the last minute, in 2443*0Sstevel@tonic-gate * which case we keep the link down until we drop back below 2444*0Sstevel@tonic-gate * the threshold. 
2445*0Sstevel@tonic-gate */ 2446*0Sstevel@tonic-gate if (!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 2447*0Sstevel@tonic-gate cur_time = getcurrenttime(); 2448*0Sstevel@tonic-gate if ((pi->pi_whenup[pi->pi_whendx] == 0 || 2449*0Sstevel@tonic-gate (cur_time - pi->pi_whenup[pi->pi_whendx]) > MSEC_PERMIN)) { 2450*0Sstevel@tonic-gate pi->pi_lfmsg_printed = 0; 2451*0Sstevel@tonic-gate return (_B_TRUE); 2452*0Sstevel@tonic-gate } 2453*0Sstevel@tonic-gate if (!pi->pi_lfmsg_printed) { 2454*0Sstevel@tonic-gate logerr("The link has come up on %s more than %d times " 2455*0Sstevel@tonic-gate "in the last minute; disabling failback until it " 2456*0Sstevel@tonic-gate "stabilizes\n", pi->pi_name, LINK_UP_PERMIN); 2457*0Sstevel@tonic-gate pi->pi_lfmsg_printed = 1; 2458*0Sstevel@tonic-gate } 2459*0Sstevel@tonic-gate 2460*0Sstevel@tonic-gate return (_B_FALSE); 2461*0Sstevel@tonic-gate } 2462*0Sstevel@tonic-gate 2463*0Sstevel@tonic-gate pii = pi->pi_v4; 2464*0Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 2465*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2466*0Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 2467*0Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 2468*0Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2469*0Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2470*0Sstevel@tonic-gate return (_B_TRUE); 2471*0Sstevel@tonic-gate } 2472*0Sstevel@tonic-gate 2473*0Sstevel@tonic-gate pii = pi->pi_v6; 2474*0Sstevel@tonic-gate if (PROBE_CAPABLE(pii)) { 2475*0Sstevel@tonic-gate pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 2476*0Sstevel@tonic-gate cur_tg = pii->pii_probes[pr_ndx].pr_target; 2477*0Sstevel@tonic-gate probe_success_info(pii, cur_tg, &psinfo); 2478*0Sstevel@tonic-gate if (psinfo.ps_nsucc >= NUM_PROBE_REPAIRS || 2479*0Sstevel@tonic-gate psinfo.ps_nsucc_tg >= NUM_PROBE_REPAIRS) 2480*0Sstevel@tonic-gate return (_B_TRUE); 2481*0Sstevel@tonic-gate } 2482*0Sstevel@tonic-gate 
2483*0Sstevel@tonic-gate return (_B_FALSE); 2484*0Sstevel@tonic-gate } 2485*0Sstevel@tonic-gate 2486*0Sstevel@tonic-gate /* 2487*0Sstevel@tonic-gate * Try failover from phyint 'pi' to a suitable destination. 2488*0Sstevel@tonic-gate */ 2489*0Sstevel@tonic-gate int 2490*0Sstevel@tonic-gate try_failover(struct phyint *pi, int failover_type) 2491*0Sstevel@tonic-gate { 2492*0Sstevel@tonic-gate struct phyint *dst; 2493*0Sstevel@tonic-gate int err; 2494*0Sstevel@tonic-gate 2495*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2496*0Sstevel@tonic-gate logdebug("try_failover(%s %d)\n", pi->pi_name, failover_type); 2497*0Sstevel@tonic-gate 2498*0Sstevel@tonic-gate /* 2499*0Sstevel@tonic-gate * Attempt to find a failover destination 'dst'. 2500*0Sstevel@tonic-gate * dst will be null if any of the following is true 2501*0Sstevel@tonic-gate * Phyint is not part of a group OR 2502*0Sstevel@tonic-gate * Phyint is the only member of a group OR 2503*0Sstevel@tonic-gate * No suitable failover dst was available 2504*0Sstevel@tonic-gate */ 2505*0Sstevel@tonic-gate dst = get_failover_dst(pi, failover_type); 2506*0Sstevel@tonic-gate if (dst == NULL) 2507*0Sstevel@tonic-gate return (IPMP_EMINRED); 2508*0Sstevel@tonic-gate 2509*0Sstevel@tonic-gate dst->pi_empty = 0; /* Per state diagram */ 2510*0Sstevel@tonic-gate pi->pi_full = 0; /* Per state diagram */ 2511*0Sstevel@tonic-gate 2512*0Sstevel@tonic-gate err = failover(pi, dst); 2513*0Sstevel@tonic-gate 2514*0Sstevel@tonic-gate if (debug & D_FAILOVER) { 2515*0Sstevel@tonic-gate logdebug("failed over from %s to %s ret %d\n", 2516*0Sstevel@tonic-gate pi->pi_name, dst->pi_name, err); 2517*0Sstevel@tonic-gate } 2518*0Sstevel@tonic-gate if (err == 0) { 2519*0Sstevel@tonic-gate pi->pi_empty = 1; /* Per state diagram */ 2520*0Sstevel@tonic-gate /* 2521*0Sstevel@tonic-gate * we don't want to print out this message if a 2522*0Sstevel@tonic-gate * phyint is leaving the group, nor for failover from 2523*0Sstevel@tonic-gate * standby 
2524*0Sstevel@tonic-gate */ 2525*0Sstevel@tonic-gate if (failover_type == FAILOVER_NORMAL) { 2526*0Sstevel@tonic-gate logerr("Successfully failed over from NIC %s to NIC " 2527*0Sstevel@tonic-gate "%s\n", pi->pi_name, dst->pi_name); 2528*0Sstevel@tonic-gate } 2529*0Sstevel@tonic-gate return (0); 2530*0Sstevel@tonic-gate } else { 2531*0Sstevel@tonic-gate /* 2532*0Sstevel@tonic-gate * The failover did not succeed. We must retry the failover 2533*0Sstevel@tonic-gate * only after resyncing our state based on the kernel's. 2534*0Sstevel@tonic-gate * For eg. either the src or the dst might have been unplumbed 2535*0Sstevel@tonic-gate * causing this failure. initifs() will be called again, 2536*0Sstevel@tonic-gate * from main, since full_scan_required has been set to true 2537*0Sstevel@tonic-gate * by failover(); 2538*0Sstevel@tonic-gate */ 2539*0Sstevel@tonic-gate return (IPMP_FAILURE); 2540*0Sstevel@tonic-gate } 2541*0Sstevel@tonic-gate } 2542*0Sstevel@tonic-gate 2543*0Sstevel@tonic-gate /* 2544*0Sstevel@tonic-gate * global_errno captures the errno value, if failover() or failback() 2545*0Sstevel@tonic-gate * fails. This is sent to if_mpadm(1M). 2546*0Sstevel@tonic-gate */ 2547*0Sstevel@tonic-gate int global_errno; 2548*0Sstevel@tonic-gate 2549*0Sstevel@tonic-gate /* 2550*0Sstevel@tonic-gate * Attempt failover from phyint 'from' to phyint 'to'. 2551*0Sstevel@tonic-gate * IP moves everything from phyint 'from' to phyint 'to'. 
2552*0Sstevel@tonic-gate */ 2553*0Sstevel@tonic-gate static int 2554*0Sstevel@tonic-gate failover(struct phyint *from, struct phyint *to) 2555*0Sstevel@tonic-gate { 2556*0Sstevel@tonic-gate struct lifreq lifr; 2557*0Sstevel@tonic-gate int ret; 2558*0Sstevel@tonic-gate 2559*0Sstevel@tonic-gate if (debug & D_FAILOVER) { 2560*0Sstevel@tonic-gate logdebug("failing over from %s to %s\n", 2561*0Sstevel@tonic-gate from->pi_name, to->pi_name); 2562*0Sstevel@tonic-gate } 2563*0Sstevel@tonic-gate 2564*0Sstevel@tonic-gate /* 2565*0Sstevel@tonic-gate * Perform the failover. Both IPv4 and IPv6 are failed over 2566*0Sstevel@tonic-gate * using a single ioctl by passing in AF_UNSPEC family. 2567*0Sstevel@tonic-gate */ 2568*0Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 2569*0Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2570*0Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 2571*0Sstevel@tonic-gate 2572*0Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILOVER, (caddr_t)&lifr); 2573*0Sstevel@tonic-gate if (ret < 0) { 2574*0Sstevel@tonic-gate global_errno = errno; 2575*0Sstevel@tonic-gate logperror("failover: ioctl (failover)"); 2576*0Sstevel@tonic-gate } 2577*0Sstevel@tonic-gate 2578*0Sstevel@tonic-gate /* 2579*0Sstevel@tonic-gate * Set full_scan_required to true. This will make us read 2580*0Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 2581*0Sstevel@tonic-gate * to reflect the current state after the failover. If the 2582*0Sstevel@tonic-gate * failover has failed it will then reissue the failover. 2583*0Sstevel@tonic-gate */ 2584*0Sstevel@tonic-gate full_scan_required = _B_TRUE; 2585*0Sstevel@tonic-gate return (ret); 2586*0Sstevel@tonic-gate } 2587*0Sstevel@tonic-gate 2588*0Sstevel@tonic-gate /* 2589*0Sstevel@tonic-gate * phyint 'pi' has recovered. 
Attempt failback from every phyint in the same 2590*0Sstevel@tonic-gate * group as phyint 'pi' that is a potential failback source, to phyint 'pi'. 2591*0Sstevel@tonic-gate * Return values: 2592*0Sstevel@tonic-gate * IPMP_SUCCESS: Failback successful from each of the other 2593*0Sstevel@tonic-gate * phyints in the group. 2594*0Sstevel@tonic-gate * IPMP_EFBPARTIAL: Failback successful from some of the other 2595*0Sstevel@tonic-gate * phyints in the group. 2596*0Sstevel@tonic-gate * IPMP_FAILURE: Failback syscall failed with some error. 2597*0Sstevel@tonic-gate * 2598*0Sstevel@tonic-gate * Note that failback is attempted regardless of the setting of the 2599*0Sstevel@tonic-gate * failback_enabled flag. 2600*0Sstevel@tonic-gate */ 2601*0Sstevel@tonic-gate int 2602*0Sstevel@tonic-gate do_failback(struct phyint *pi, boolean_t check_only) 2603*0Sstevel@tonic-gate { 2604*0Sstevel@tonic-gate struct phyint *from; 2605*0Sstevel@tonic-gate boolean_t done; 2606*0Sstevel@tonic-gate boolean_t partial; 2607*0Sstevel@tonic-gate boolean_t attempted_failback = _B_FALSE; 2608*0Sstevel@tonic-gate 2609*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2610*0Sstevel@tonic-gate logdebug("do_failback(%s)\n", pi->pi_name); 2611*0Sstevel@tonic-gate 2612*0Sstevel@tonic-gate /* If this phyint is not part of a named group, return. */ 2613*0Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) { 2614*0Sstevel@tonic-gate pi->pi_full = 1; 2615*0Sstevel@tonic-gate return (IPMP_SUCCESS); 2616*0Sstevel@tonic-gate } 2617*0Sstevel@tonic-gate 2618*0Sstevel@tonic-gate /* 2619*0Sstevel@tonic-gate * Attempt failback from every phyint in the group to 'pi'. 2620*0Sstevel@tonic-gate * The reason for doing this, instead of only from the 2621*0Sstevel@tonic-gate * phyint to which we did the failover is given below. 2622*0Sstevel@tonic-gate * 2623*0Sstevel@tonic-gate * After 'pi' failed, if any app. 
tries to join on a multicast 2624*0Sstevel@tonic-gate * address (IPv6), on the failed phyint, IP picks any arbitrary 2625*0Sstevel@tonic-gate * non-failed phyint in the group, instead of the failed phyint, 2626*0Sstevel@tonic-gate * in.mpathd is not aware of this. Thus failing back only from the 2627*0Sstevel@tonic-gate * interface to which 'pi' failed over, will failback the ipif's 2628*0Sstevel@tonic-gate * but not the ilm's. So we need to failback from all members of 2629*0Sstevel@tonic-gate * the phyint group 2630*0Sstevel@tonic-gate */ 2631*0Sstevel@tonic-gate done = _B_TRUE; 2632*0Sstevel@tonic-gate partial = _B_FALSE; 2633*0Sstevel@tonic-gate for (from = pi->pi_group->pg_phyint; from != NULL; 2634*0Sstevel@tonic-gate from = from->pi_pgnext) { 2635*0Sstevel@tonic-gate /* Exclude ourself as a failback src */ 2636*0Sstevel@tonic-gate if (from == pi) 2637*0Sstevel@tonic-gate continue; 2638*0Sstevel@tonic-gate 2639*0Sstevel@tonic-gate /* 2640*0Sstevel@tonic-gate * If the 'from' phyint has IPv4 plumbed, the 'to' 2641*0Sstevel@tonic-gate * phyint must also have IPv4 plumbed. Similar check 2642*0Sstevel@tonic-gate * for IPv6. IP makes the same check. Otherwise the 2643*0Sstevel@tonic-gate * failback will fail. 
2644*0Sstevel@tonic-gate */ 2645*0Sstevel@tonic-gate if ((from->pi_v4 != NULL && pi->pi_v4 == NULL) || 2646*0Sstevel@tonic-gate (from->pi_v6 != NULL && pi->pi_v6 == NULL)) { 2647*0Sstevel@tonic-gate partial = _B_TRUE; 2648*0Sstevel@tonic-gate continue; 2649*0Sstevel@tonic-gate } 2650*0Sstevel@tonic-gate 2651*0Sstevel@tonic-gate if (!check_only) { 2652*0Sstevel@tonic-gate pi->pi_empty = 0; /* Per state diagram */ 2653*0Sstevel@tonic-gate attempted_failback = _B_TRUE; 2654*0Sstevel@tonic-gate if (failback(from, pi) != 0) { 2655*0Sstevel@tonic-gate done = _B_FALSE; 2656*0Sstevel@tonic-gate break; 2657*0Sstevel@tonic-gate } 2658*0Sstevel@tonic-gate } 2659*0Sstevel@tonic-gate } 2660*0Sstevel@tonic-gate 2661*0Sstevel@tonic-gate if (check_only) { 2662*0Sstevel@tonic-gate return (partial ? IPMP_EFBPARTIAL : IPMP_SUCCESS); 2663*0Sstevel@tonic-gate } 2664*0Sstevel@tonic-gate 2665*0Sstevel@tonic-gate /* 2666*0Sstevel@tonic-gate * We are done. No more phyint from which we can src the failback 2667*0Sstevel@tonic-gate */ 2668*0Sstevel@tonic-gate if (done) { 2669*0Sstevel@tonic-gate if (!partial) 2670*0Sstevel@tonic-gate pi->pi_full = 1; /* Per state diagram */ 2671*0Sstevel@tonic-gate /* 2672*0Sstevel@tonic-gate * Don't print out a message unless there is a 2673*0Sstevel@tonic-gate * transition from FAILED to RUNNING. For eg. 2674*0Sstevel@tonic-gate * we don't want to print out this message if a 2675*0Sstevel@tonic-gate * phyint is leaving the group, or at startup 2676*0Sstevel@tonic-gate */ 2677*0Sstevel@tonic-gate if (attempted_failback && (pi->pi_flags & 2678*0Sstevel@tonic-gate (IFF_FAILED | IFF_OFFLINE))) { 2679*0Sstevel@tonic-gate logerr("Successfully failed back to NIC %s\n", 2680*0Sstevel@tonic-gate pi->pi_name); 2681*0Sstevel@tonic-gate } 2682*0Sstevel@tonic-gate return (partial ? 
IPMP_EFBPARTIAL : IPMP_SUCCESS); 2683*0Sstevel@tonic-gate } 2684*0Sstevel@tonic-gate 2685*0Sstevel@tonic-gate return (IPMP_FAILURE); 2686*0Sstevel@tonic-gate } 2687*0Sstevel@tonic-gate 2688*0Sstevel@tonic-gate /* 2689*0Sstevel@tonic-gate * This function is similar to do_failback() above, but respects the 2690*0Sstevel@tonic-gate * failback_enabled flag for phyints in named groups. 2691*0Sstevel@tonic-gate */ 2692*0Sstevel@tonic-gate int 2693*0Sstevel@tonic-gate try_failback(struct phyint *pi, boolean_t check_only) 2694*0Sstevel@tonic-gate { 2695*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2696*0Sstevel@tonic-gate logdebug("try_failback(%s)\n", pi->pi_name); 2697*0Sstevel@tonic-gate 2698*0Sstevel@tonic-gate if (pi->pi_group != phyint_anongroup && !failback_enabled) 2699*0Sstevel@tonic-gate return (IPMP_EFBDISABLED); 2700*0Sstevel@tonic-gate 2701*0Sstevel@tonic-gate return (do_failback(pi, check_only)); 2702*0Sstevel@tonic-gate } 2703*0Sstevel@tonic-gate 2704*0Sstevel@tonic-gate /* 2705*0Sstevel@tonic-gate * Failback everything from phyint 'from' that has the same ifindex 2706*0Sstevel@tonic-gate * as phyint to's ifindex. 
2707*0Sstevel@tonic-gate */ 2708*0Sstevel@tonic-gate static int 2709*0Sstevel@tonic-gate failback(struct phyint *from, struct phyint *to) 2710*0Sstevel@tonic-gate { 2711*0Sstevel@tonic-gate struct lifreq lifr; 2712*0Sstevel@tonic-gate int ret; 2713*0Sstevel@tonic-gate 2714*0Sstevel@tonic-gate if (debug & D_FAILOVER) 2715*0Sstevel@tonic-gate logdebug("failback(%s %s)\n", from->pi_name, to->pi_name); 2716*0Sstevel@tonic-gate 2717*0Sstevel@tonic-gate lifr.lifr_addr.ss_family = AF_UNSPEC; 2718*0Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, from->pi_name, sizeof (lifr.lifr_name)); 2719*0Sstevel@tonic-gate lifr.lifr_movetoindex = to->pi_ifindex; 2720*0Sstevel@tonic-gate 2721*0Sstevel@tonic-gate ret = ioctl(ifsock_v4, SIOCLIFFAILBACK, (caddr_t)&lifr); 2722*0Sstevel@tonic-gate if (ret < 0) { 2723*0Sstevel@tonic-gate global_errno = errno; 2724*0Sstevel@tonic-gate logperror("failback: ioctl (failback)"); 2725*0Sstevel@tonic-gate } 2726*0Sstevel@tonic-gate 2727*0Sstevel@tonic-gate /* 2728*0Sstevel@tonic-gate * Set full_scan_required to true. This will make us read 2729*0Sstevel@tonic-gate * the state from the kernel in initifs() and update our tables, 2730*0Sstevel@tonic-gate * to reflect the current state after the failback. If the 2731*0Sstevel@tonic-gate * failback has failed it will then reissue the failback. 2732*0Sstevel@tonic-gate */ 2733*0Sstevel@tonic-gate full_scan_required = _B_TRUE; 2734*0Sstevel@tonic-gate 2735*0Sstevel@tonic-gate return (ret); 2736*0Sstevel@tonic-gate } 2737*0Sstevel@tonic-gate 2738*0Sstevel@tonic-gate /* 2739*0Sstevel@tonic-gate * Select a target phyint for failing over from 'pi'. 2740*0Sstevel@tonic-gate * In the normal case i.e. failover_type is FAILOVER_NORMAL, the preferred 2741*0Sstevel@tonic-gate * target phyint is chosen as follows, 2742*0Sstevel@tonic-gate * 1. Pick any inactive standby interface. 2743*0Sstevel@tonic-gate * 2. 
If no inactive standby is available, select any phyint in the 2744*0Sstevel@tonic-gate * same group that has the least number of logints, (excluding 2745*0Sstevel@tonic-gate * IFF_NOFAILOVER and !IFF_UP logints) 2746*0Sstevel@tonic-gate * If we are failing over from a standby, failover_type is 2747*0Sstevel@tonic-gate * FAILOVER_TO_NONSTANDBY, and we won't pick a standby for the destination. 2748*0Sstevel@tonic-gate * If a phyint is leaving the group, then failover_type is FAILOVER_TO_ANY, 2749*0Sstevel@tonic-gate * and we won't return NULL, as long as there is at least 1 other phyint 2750*0Sstevel@tonic-gate * in the group. 2751*0Sstevel@tonic-gate */ 2752*0Sstevel@tonic-gate static struct phyint * 2753*0Sstevel@tonic-gate get_failover_dst(struct phyint *pi, int failover_type) 2754*0Sstevel@tonic-gate { 2755*0Sstevel@tonic-gate struct phyint *maybe = NULL; 2756*0Sstevel@tonic-gate struct phyint *pi2; 2757*0Sstevel@tonic-gate struct phyint *last_choice = NULL; 2758*0Sstevel@tonic-gate 2759*0Sstevel@tonic-gate if (pi->pi_group == phyint_anongroup) 2760*0Sstevel@tonic-gate return (NULL); 2761*0Sstevel@tonic-gate 2762*0Sstevel@tonic-gate /* 2763*0Sstevel@tonic-gate * Loop thru the phyints in the group, and pick the preferred 2764*0Sstevel@tonic-gate * phyint for the target. 2765*0Sstevel@tonic-gate */ 2766*0Sstevel@tonic-gate for (pi2 = pi->pi_group->pg_phyint; pi2 != NULL; pi2 = pi2->pi_pgnext) { 2767*0Sstevel@tonic-gate /* Exclude ourself and offlined interfaces */ 2768*0Sstevel@tonic-gate if (pi2 == pi || pi2->pi_state == PI_OFFLINE) 2769*0Sstevel@tonic-gate continue; 2770*0Sstevel@tonic-gate 2771*0Sstevel@tonic-gate /* 2772*0Sstevel@tonic-gate * The chosen target phyint must have IPv4 instance 2773*0Sstevel@tonic-gate * plumbed, if the src phyint has IPv4 plumbed. Similarly 2774*0Sstevel@tonic-gate * for IPv6. 
2775*0Sstevel@tonic-gate */ 2776*0Sstevel@tonic-gate if ((pi2->pi_v4 == NULL && pi->pi_v4 != NULL) || 2777*0Sstevel@tonic-gate (pi2->pi_v6 == NULL && pi->pi_v6 != NULL)) 2778*0Sstevel@tonic-gate continue; 2779*0Sstevel@tonic-gate 2780*0Sstevel@tonic-gate /* The chosen target must be PI_RUNNING. */ 2781*0Sstevel@tonic-gate if (pi2->pi_state != PI_RUNNING) { 2782*0Sstevel@tonic-gate last_choice = pi2; 2783*0Sstevel@tonic-gate continue; 2784*0Sstevel@tonic-gate } 2785*0Sstevel@tonic-gate 2786*0Sstevel@tonic-gate if ((pi2->pi_flags & IFF_INACTIVE) && 2787*0Sstevel@tonic-gate (failover_type != FAILOVER_TO_NONSTANDBY)) { 2788*0Sstevel@tonic-gate return (pi2); 2789*0Sstevel@tonic-gate } else { 2790*0Sstevel@tonic-gate if (maybe == NULL) 2791*0Sstevel@tonic-gate maybe = pi2; 2792*0Sstevel@tonic-gate else if (logint_upcount(pi2) < logint_upcount(maybe)) 2793*0Sstevel@tonic-gate maybe = pi2; 2794*0Sstevel@tonic-gate } 2795*0Sstevel@tonic-gate } 2796*0Sstevel@tonic-gate if (maybe == NULL && failover_type == FAILOVER_TO_ANY) 2797*0Sstevel@tonic-gate return (last_choice); 2798*0Sstevel@tonic-gate else 2799*0Sstevel@tonic-gate return (maybe); 2800*0Sstevel@tonic-gate } 2801*0Sstevel@tonic-gate 2802*0Sstevel@tonic-gate /* 2803*0Sstevel@tonic-gate * Used to set/clear phyint flags, by making a SIOCSLIFFLAGS call. 
2804*0Sstevel@tonic-gate */ 2805*0Sstevel@tonic-gate boolean_t 2806*0Sstevel@tonic-gate change_lif_flags(struct phyint *pi, uint64_t flags, boolean_t setfl) 2807*0Sstevel@tonic-gate { 2808*0Sstevel@tonic-gate int ifsock; 2809*0Sstevel@tonic-gate struct lifreq lifr; 2810*0Sstevel@tonic-gate 2811*0Sstevel@tonic-gate if (debug & D_FAILOVER) { 2812*0Sstevel@tonic-gate logdebug("change_lif_flags(%s): flags %llx setfl %d\n", 2813*0Sstevel@tonic-gate pi->pi_name, flags, (int)setfl); 2814*0Sstevel@tonic-gate } 2815*0Sstevel@tonic-gate 2816*0Sstevel@tonic-gate if (pi->pi_v4 != NULL) { 2817*0Sstevel@tonic-gate ifsock = ifsock_v4; 2818*0Sstevel@tonic-gate } else { 2819*0Sstevel@tonic-gate ifsock = ifsock_v6; 2820*0Sstevel@tonic-gate } 2821*0Sstevel@tonic-gate 2822*0Sstevel@tonic-gate /* 2823*0Sstevel@tonic-gate * Get the current flags from the kernel, and set/clear the 2824*0Sstevel@tonic-gate * desired phyint flags. Since we set only phyint flags, we can 2825*0Sstevel@tonic-gate * do it on either IPv4 or IPv6 instance. 
2826*0Sstevel@tonic-gate */ 2827*0Sstevel@tonic-gate (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 2828*0Sstevel@tonic-gate lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 2829*0Sstevel@tonic-gate if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 2830*0Sstevel@tonic-gate if (errno != ENXIO) 2831*0Sstevel@tonic-gate logperror("change_lif_flags: ioctl (get flags)"); 2832*0Sstevel@tonic-gate return (_B_FALSE); 2833*0Sstevel@tonic-gate } 2834*0Sstevel@tonic-gate if (setfl) 2835*0Sstevel@tonic-gate lifr.lifr_flags |= flags; 2836*0Sstevel@tonic-gate else 2837*0Sstevel@tonic-gate lifr.lifr_flags &= ~flags; 2838*0Sstevel@tonic-gate if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 2839*0Sstevel@tonic-gate if (errno != ENXIO) 2840*0Sstevel@tonic-gate logperror("change_lif_flags: ioctl (set flags)"); 2841*0Sstevel@tonic-gate return (_B_FALSE); 2842*0Sstevel@tonic-gate } 2843*0Sstevel@tonic-gate 2844*0Sstevel@tonic-gate /* 2845*0Sstevel@tonic-gate * Keep pi_flags in synch. with actual flags. Assumes flags are 2846*0Sstevel@tonic-gate * phyint flags. 2847*0Sstevel@tonic-gate */ 2848*0Sstevel@tonic-gate if (setfl) 2849*0Sstevel@tonic-gate pi->pi_flags |= flags; 2850*0Sstevel@tonic-gate else 2851*0Sstevel@tonic-gate pi->pi_flags &= ~flags; 2852*0Sstevel@tonic-gate 2853*0Sstevel@tonic-gate if (pi->pi_v4) 2854*0Sstevel@tonic-gate pi->pi_v4->pii_flags = pi->pi_flags; 2855*0Sstevel@tonic-gate 2856*0Sstevel@tonic-gate if (pi->pi_v6) 2857*0Sstevel@tonic-gate pi->pi_v6->pii_flags = pi->pi_flags; 2858*0Sstevel@tonic-gate 2859*0Sstevel@tonic-gate return (_B_TRUE); 2860*0Sstevel@tonic-gate } 2861*0Sstevel@tonic-gate 2862*0Sstevel@tonic-gate /* 2863*0Sstevel@tonic-gate * icmp cksum computation for IPv4. 
2864*0Sstevel@tonic-gate */ 2865*0Sstevel@tonic-gate static int 2866*0Sstevel@tonic-gate in_cksum(ushort_t *addr, int len) 2867*0Sstevel@tonic-gate { 2868*0Sstevel@tonic-gate register int nleft = len; 2869*0Sstevel@tonic-gate register ushort_t *w = addr; 2870*0Sstevel@tonic-gate register ushort_t answer; 2871*0Sstevel@tonic-gate ushort_t odd_byte = 0; 2872*0Sstevel@tonic-gate register int sum = 0; 2873*0Sstevel@tonic-gate 2874*0Sstevel@tonic-gate /* 2875*0Sstevel@tonic-gate * Our algorithm is simple, using a 32 bit accumulator (sum), 2876*0Sstevel@tonic-gate * we add sequential 16 bit words to it, and at the end, fold 2877*0Sstevel@tonic-gate * back all the carry bits from the top 16 bits into the lower 2878*0Sstevel@tonic-gate * 16 bits. 2879*0Sstevel@tonic-gate */ 2880*0Sstevel@tonic-gate while (nleft > 1) { 2881*0Sstevel@tonic-gate sum += *w++; 2882*0Sstevel@tonic-gate nleft -= 2; 2883*0Sstevel@tonic-gate } 2884*0Sstevel@tonic-gate 2885*0Sstevel@tonic-gate /* mop up an odd byte, if necessary */ 2886*0Sstevel@tonic-gate if (nleft == 1) { 2887*0Sstevel@tonic-gate *(uchar_t *)(&odd_byte) = *(uchar_t *)w; 2888*0Sstevel@tonic-gate sum += odd_byte; 2889*0Sstevel@tonic-gate } 2890*0Sstevel@tonic-gate 2891*0Sstevel@tonic-gate /* 2892*0Sstevel@tonic-gate * add back carry outs from top 16 bits to low 16 bits 2893*0Sstevel@tonic-gate */ 2894*0Sstevel@tonic-gate sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ 2895*0Sstevel@tonic-gate sum += (sum >> 16); /* add carry */ 2896*0Sstevel@tonic-gate answer = ~sum; /* truncate to 16 bits */ 2897*0Sstevel@tonic-gate return (answer); 2898*0Sstevel@tonic-gate } 2899*0Sstevel@tonic-gate 2900*0Sstevel@tonic-gate static void 2901*0Sstevel@tonic-gate reset_snxt_basetimes(void) 2902*0Sstevel@tonic-gate { 2903*0Sstevel@tonic-gate struct phyint_instance *pii; 2904*0Sstevel@tonic-gate 2905*0Sstevel@tonic-gate for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2906*0Sstevel@tonic-gate pii->pii_fd_snxt_basetime = 
pii->pii_snxt_basetime; 2907*0Sstevel@tonic-gate } 2908*0Sstevel@tonic-gate } 2909*0Sstevel@tonic-gate 2910*0Sstevel@tonic-gate /* 2911*0Sstevel@tonic-gate * Is the address one of our own addresses? Unfortunately, 2912*0Sstevel@tonic-gate * we cannot check our phyint tables to determine if the address 2913*0Sstevel@tonic-gate * is our own. This is because, we don't track interfaces that 2914*0Sstevel@tonic-gate * are not part of any group. We have to either use a 'bind' or 2915*0Sstevel@tonic-gate * get the complete list of all interfaces using SIOCGLIFCONF, 2916*0Sstevel@tonic-gate * to do this check. We choose to use 'bind'. We could use 2917*0Sstevel@tonic-gate * SIOCTMYADDR, but bind is preferred, since it is stronger. 2918*0Sstevel@tonic-gate * SIOCTMYADDR excludes down interfaces, while bind includes even 2919*0Sstevel@tonic-gate * down interfaces. 2920*0Sstevel@tonic-gate */ 2921*0Sstevel@tonic-gate boolean_t 2922*0Sstevel@tonic-gate own_address(int af, struct in6_addr addr) 2923*0Sstevel@tonic-gate { 2924*0Sstevel@tonic-gate int sock; 2925*0Sstevel@tonic-gate boolean_t ours = _B_TRUE; 2926*0Sstevel@tonic-gate 2927*0Sstevel@tonic-gate sock = socket(AF_INET6, SOCK_DGRAM, 0); 2928*0Sstevel@tonic-gate if (sock == -1) { 2929*0Sstevel@tonic-gate logperror("own_address: socket"); 2930*0Sstevel@tonic-gate /* 2931*0Sstevel@tonic-gate * If the socket call fails, err on the side of caution, 2932*0Sstevel@tonic-gate * and return true. 2933*0Sstevel@tonic-gate */ 2934*0Sstevel@tonic-gate } else { 2935*0Sstevel@tonic-gate struct sockaddr_in6 sin6; 2936*0Sstevel@tonic-gate 2937*0Sstevel@tonic-gate (void) memset(&sin6, 0, sizeof (struct sockaddr_in6)); 2938*0Sstevel@tonic-gate sin6.sin6_family = AF_INET6; 2939*0Sstevel@tonic-gate sin6.sin6_addr = addr; 2940*0Sstevel@tonic-gate /* 2941*0Sstevel@tonic-gate * If the bind succeeds, then this address is one of our 2942*0Sstevel@tonic-gate * addresses. 
2943*0Sstevel@tonic-gate * If bind returns error EADDRNOTAVAIL, the address is 2944*0Sstevel@tonic-gate * not one of ours. 2945*0Sstevel@tonic-gate * If bind returns an error other than EADDRNOTAVAIL, err 2946*0Sstevel@tonic-gate * on the side of caution and report the address as one of 2947*0Sstevel@tonic-gate * our own. 2948*0Sstevel@tonic-gate */ 2949*0Sstevel@tonic-gate if (bind(sock, (struct sockaddr *)&sin6, 2950*0Sstevel@tonic-gate sizeof (struct sockaddr_in6)) == -1) { 2951*0Sstevel@tonic-gate if (errno == EADDRNOTAVAIL) 2952*0Sstevel@tonic-gate ours = _B_FALSE; 2953*0Sstevel@tonic-gate else 2954*0Sstevel@tonic-gate logperror("own_address: bind"); 2955*0Sstevel@tonic-gate } 2956*0Sstevel@tonic-gate (void) close(sock); 2957*0Sstevel@tonic-gate } 2958*0Sstevel@tonic-gate if (debug & D_TARGET) { 2959*0Sstevel@tonic-gate char abuf[INET6_ADDRSTRLEN]; 2960*0Sstevel@tonic-gate 2961*0Sstevel@tonic-gate logdebug("own_address: addr %s is %s ours\n", 2962*0Sstevel@tonic-gate pr_addr(af, addr, abuf, sizeof (abuf)), 2963*0Sstevel@tonic-gate ours ? "one of" : "not"); 2964*0Sstevel@tonic-gate } 2965*0Sstevel@tonic-gate return (ours); 2966*0Sstevel@tonic-gate } 2967