/*
 * CDDL HEADER START
 *
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/aggsum.h>

/*
 * Aggregate-sum counters are a form of fanned-out counter, used when atomic
 * instructions on a single field cause enough CPU cache line contention to
 * slow system performance.  Due to their increased overhead and the expense
 * of reading them precisely, they should only be used in cases where the
 * write rate (increment/decrement) is much higher than the read rate
 * (get value).
 *
 * Aggregate-sum counters consist of two basic parts: the core and the
 * buckets.  The core counter contains a lock for the entire counter, as well
 * as the current upper and lower bounds on the value of the counter.  The
 * aggsum_bucket structure contains a per-bucket lock to protect the contents
 * of the bucket, the current amount that this bucket has changed from the
 * global counter (called the delta), and the amount it has "borrowed" from
 * the core counter.
 *
 * The basic operation of an aggsum is simple.  Threads that wish to modify
 * the counter will modify one bucket's counter (determined by their current
 * CPU, to help minimize lock and cache contention).  If the bucket already
 * has sufficient capacity borrowed from the core structure to handle their
 * request, they simply modify the delta and return.  If the bucket does not,
 * we clear the bucket's current state (to prevent the borrowed amounts from
 * getting too large), and borrow more from the core counter.  Borrowing is
 * done by adding to the upper bound (or subtracting from the lower bound) of
 * the core counter, and setting the borrow value for the bucket to the amount
 * added (or subtracted).  Clearing the bucket is the opposite; we add the
 * current delta to both the lower and upper bounds of the core counter,
 * subtract the borrowed amount from the upper bound, and add the borrowed
 * amount to the lower bound.
 * Note that only borrowing and clearing require access to the core counter;
 * since all other operations access CPU-local resources, performance can be
 * much higher than with a traditional counter.
 *
 * Threads that wish to read from the counter have a slightly more challenging
 * task.  It is fast to determine the upper and lower bounds of the aggsum;
 * this does not require grabbing any locks.  This suffices for cases where an
 * approximation of the aggsum's value is acceptable.  However, a caller that
 * needs to know whether some specific value is above or below the current
 * value of the aggsum invokes aggsum_compare().  This function operates by
 * repeatedly comparing the target value to the upper and lower bounds of the
 * aggsum, and then clearing a bucket.  This proceeds until either the target
 * falls outside the bounds and we can return an answer, or the last bucket
 * has been cleared and we know that the target is equal to the aggsum's
 * value.  Finally, the most expensive operation is determining the precise
 * value of the aggsum.  To do this, we clear every bucket and then return the
 * upper bound (which must then be equal to the lower bound).  What makes
 * aggsum_compare() and aggsum_value() expensive is clearing buckets.  This
 * involves grabbing the global lock (serializing against each other and
 * against borrow operations), grabbing a bucket's lock (preventing threads on
 * that CPU from modifying their delta), and zeroing out the borrowed value
 * (forcing the bucket to borrow again on its next update, which will also be
 * expensive).  This is what makes aggsums well suited for write-many
 * read-rarely operations.
 */
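
/*
 * A minimal usage sketch (illustrative only; it is not part of this file and
 * the foo_* names are hypothetical).  A typical consumer calls aggsum_add()
 * on its hot path, where normally only a per-CPU bucket lock is taken, and
 * reserves the expensive aggsum_value()/aggsum_compare() calls for rare
 * slow-path decisions:
 *
 *        static aggsum_t foo_bytes;
 *
 *        void
 *        foo_init(void)
 *        {
 *                aggsum_init(&foo_bytes, 0);
 *        }
 *
 *        void
 *        foo_io_done(int64_t nbytes)
 *        {
 *                aggsum_add(&foo_bytes, nbytes);
 *        }
 *
 *        uint64_t
 *        foo_bytes_exact(void)
 *        {
 *                return (aggsum_value(&foo_bytes));
 *        }
 *
 *        void
 *        foo_fini(void)
 *        {
 *                aggsum_fini(&foo_bytes);
 *        }
 */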

/*
 * We will borrow aggsum_borrow_multiplier times the current request, so we
 * will have to take the as_lock only approximately once every
 * aggsum_borrow_multiplier calls to aggsum_add().
 */
static uint_t aggsum_borrow_multiplier = 10;

void
aggsum_init(aggsum_t *as, uint64_t value)
{
        bzero(as, sizeof (*as));
        as->as_lower_bound = as->as_upper_bound = value;
        mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
        as->as_numbuckets = boot_ncpus;
        as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
            KM_SLEEP);
        for (int i = 0; i < as->as_numbuckets; i++) {
                mutex_init(&as->as_buckets[i].asc_lock,
                    NULL, MUTEX_DEFAULT, NULL);
        }
}

void
aggsum_fini(aggsum_t *as)
{
        for (int i = 0; i < as->as_numbuckets; i++)
                mutex_destroy(&as->as_buckets[i].asc_lock);
        kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t));
        mutex_destroy(&as->as_lock);
}

int64_t
aggsum_lower_bound(aggsum_t *as)
{
        return (as->as_lower_bound);
}

int64_t
aggsum_upper_bound(aggsum_t *as)
{
        return (as->as_upper_bound);
}
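
/*
 * The two accessors above are the cheap, lock-free way to read an aggsum:
 * the counter's true value always lies between as_lower_bound and
 * as_upper_bound (outside the brief windows in which a borrow or flush is
 * updating the two bounds).  A hedged sketch of an approximate limit check,
 * using hypothetical foo_bytes/limit names, where only the inconclusive case
 * has to fall back to the expensive aggsum_compare():
 *
 *        if (aggsum_lower_bound(&foo_bytes) > limit)
 *                return (B_TRUE);
 *        if (aggsum_upper_bound(&foo_bytes) < limit)
 *                return (B_FALSE);
 *        return (aggsum_compare(&foo_bytes, limit) > 0);
 */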

static void
aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
{
        ASSERT(MUTEX_HELD(&as->as_lock));
        ASSERT(MUTEX_HELD(&asb->asc_lock));

        /*
         * We use atomic instructions for this because we read the upper and
         * lower bounds without the lock, so we need stores to be atomic.
         */
        atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
            asb->asc_delta + asb->asc_borrowed);
        atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
            asb->asc_delta - asb->asc_borrowed);
        asb->asc_delta = 0;
        asb->asc_borrowed = 0;
}

uint64_t
aggsum_value(aggsum_t *as)
{
        int64_t rv;

        mutex_enter(&as->as_lock);
        if (as->as_lower_bound == as->as_upper_bound) {
                rv = as->as_lower_bound;
                for (int i = 0; i < as->as_numbuckets; i++) {
                        ASSERT0(as->as_buckets[i].asc_delta);
                        ASSERT0(as->as_buckets[i].asc_borrowed);
                }
                mutex_exit(&as->as_lock);
                return (rv);
        }
        for (int i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
                mutex_enter(&asb->asc_lock);
                aggsum_flush_bucket(as, asb);
                mutex_exit(&asb->asc_lock);
        }
        VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
        rv = as->as_lower_bound;
        mutex_exit(&as->as_lock);

        return (rv);
}

void
aggsum_add(aggsum_t *as, int64_t delta)
{
        struct aggsum_bucket *asb;
        int64_t borrow;

        kpreempt_disable();
        asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets];
        kpreempt_enable();

        /* Try fast path if we already borrowed enough before. */
        mutex_enter(&asb->asc_lock);
        if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed &&
            asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) {
                asb->asc_delta += delta;
                mutex_exit(&asb->asc_lock);
                return;
        }
        mutex_exit(&asb->asc_lock);

        /*
         * We haven't borrowed enough.  Take the global lock and borrow
         * considering what is requested now and what we borrowed before.
         */
        borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
        mutex_enter(&as->as_lock);
        mutex_enter(&asb->asc_lock);
        delta += asb->asc_delta;
        asb->asc_delta = 0;
        if (borrow >= asb->asc_borrowed)
                borrow -= asb->asc_borrowed;
        else
                borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
        asb->asc_borrowed += borrow;
        atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
            delta - borrow);
        atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
            delta + borrow);
        mutex_exit(&asb->asc_lock);
        mutex_exit(&as->as_lock);
}
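
/*
 * A worked example of the borrow/flush arithmetic above, assuming a fresh
 * aggsum initialized to 0, a single bucket in play, and the default
 * aggsum_borrow_multiplier of 10:
 *
 *        aggsum_add(as, 5):  the bucket has no borrowed capacity, so it
 *            takes the slow path and borrows 5 * 10 = 50; the bucket ends up
 *            with asc_delta = 0 and asc_borrowed = 50, and the core bounds
 *            move to [0 + 5 - 50, 0 + 5 + 50] = [-45, 55].
 *        aggsum_add(as, 3):  3 fits within the borrowed 50, so only
 *            asc_delta changes (to 3); the bounds stay [-45, 55], which
 *            still brackets the true value of 8.
 *        aggsum_flush_bucket():  adds asc_delta + asc_borrowed = 53 to the
 *            lower bound and asc_delta - asc_borrowed = -47 to the upper
 *            bound, collapsing both to 8, the exact value.
 */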

/*
 * Compare the aggsum value to target efficiently.  Returns -1 if the value
 * represented by the aggsum is less than target, 1 if it's greater, and 0 if
 * they are equal.
 */
int
aggsum_compare(aggsum_t *as, uint64_t target)
{
        if (as->as_upper_bound < target)
                return (-1);
        if (as->as_lower_bound > target)
                return (1);
        mutex_enter(&as->as_lock);
        for (int i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
                mutex_enter(&asb->asc_lock);
                aggsum_flush_bucket(as, asb);
                mutex_exit(&asb->asc_lock);
                if (as->as_upper_bound < target) {
                        mutex_exit(&as->as_lock);
                        return (-1);
                }
                if (as->as_lower_bound > target) {
                        mutex_exit(&as->as_lock);
                        return (1);
                }
        }
        VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
        ASSERT3U(as->as_lower_bound, ==, target);
        mutex_exit(&as->as_lock);
        return (0);
}
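
/*
 * Illustrative sketch of a caller (not part of this file; cache_size,
 * cache_limit, and cache_reclaim() are hypothetical): because
 * aggsum_compare() stops flushing buckets as soon as the bounds can answer
 * the question, it is usually much cheaper than aggsum_value() when the
 * counter is far from the target.
 *
 *        if (aggsum_compare(&cache_size, cache_limit) >= 0)
 *                cache_reclaim();
 */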