1*eda14cbcSMatt Macy /* 2*eda14cbcSMatt Macy * CDDL HEADER START 3*eda14cbcSMatt Macy * 4*eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5*eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6*eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7*eda14cbcSMatt Macy * 8*eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10*eda14cbcSMatt Macy * See the License for the specific language governing permissions 11*eda14cbcSMatt Macy * and limitations under the License. 12*eda14cbcSMatt Macy * 13*eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14*eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16*eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17*eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18*eda14cbcSMatt Macy * 19*eda14cbcSMatt Macy * CDDL HEADER END 20*eda14cbcSMatt Macy */ 21*eda14cbcSMatt Macy /* 22*eda14cbcSMatt Macy * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 23*eda14cbcSMatt Macy */ 24*eda14cbcSMatt Macy 25*eda14cbcSMatt Macy #include <sys/zfs_context.h> 26*eda14cbcSMatt Macy #include <sys/types.h> 27*eda14cbcSMatt Macy #include <sys/zio.h> 28*eda14cbcSMatt Macy #include <sys/debug.h> 29*eda14cbcSMatt Macy #include <sys/zfs_debug.h> 30*eda14cbcSMatt Macy #include <sys/vdev_raidz.h> 31*eda14cbcSMatt Macy #include <sys/vdev_raidz_impl.h> 32*eda14cbcSMatt Macy #include <sys/simd.h> 33*eda14cbcSMatt Macy 34*eda14cbcSMatt Macy /* Opaque implementation with NULL methods to represent original methods */ 35*eda14cbcSMatt Macy static const raidz_impl_ops_t vdev_raidz_original_impl = { 36*eda14cbcSMatt Macy .name = "original", 37*eda14cbcSMatt Macy .is_supported = raidz_will_scalar_work, 38*eda14cbcSMatt Macy }; 39*eda14cbcSMatt Macy 40*eda14cbcSMatt Macy /* RAIDZ parity op that contain the fastest methods */ 41*eda14cbcSMatt Macy static raidz_impl_ops_t vdev_raidz_fastest_impl = { 42*eda14cbcSMatt Macy .name = "fastest" 43*eda14cbcSMatt Macy }; 44*eda14cbcSMatt Macy 45*eda14cbcSMatt Macy /* All compiled in implementations */ 46*eda14cbcSMatt Macy const raidz_impl_ops_t *raidz_all_maths[] = { 47*eda14cbcSMatt Macy &vdev_raidz_original_impl, 48*eda14cbcSMatt Macy &vdev_raidz_scalar_impl, 49*eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ 50*eda14cbcSMatt Macy &vdev_raidz_sse2_impl, 51*eda14cbcSMatt Macy #endif 52*eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */ 53*eda14cbcSMatt Macy &vdev_raidz_ssse3_impl, 54*eda14cbcSMatt Macy #endif 55*eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ 56*eda14cbcSMatt Macy &vdev_raidz_avx2_impl, 57*eda14cbcSMatt Macy #endif 58*eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ 59*eda14cbcSMatt Macy &vdev_raidz_avx512f_impl, 60*eda14cbcSMatt Macy #endif 61*eda14cbcSMatt Macy #if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ 62*eda14cbcSMatt Macy &vdev_raidz_avx512bw_impl, 63*eda14cbcSMatt Macy #endif 64*eda14cbcSMatt Macy #if defined(__aarch64__) 65*eda14cbcSMatt Macy &vdev_raidz_aarch64_neon_impl, 66*eda14cbcSMatt Macy &vdev_raidz_aarch64_neonx2_impl, 67*eda14cbcSMatt Macy #endif 68*eda14cbcSMatt Macy #if defined(__powerpc__) && defined(__altivec__) 69*eda14cbcSMatt Macy &vdev_raidz_powerpc_altivec_impl, 70*eda14cbcSMatt Macy #endif 71*eda14cbcSMatt Macy }; 72*eda14cbcSMatt Macy 73*eda14cbcSMatt Macy /* Indicate that benchmark has been completed */ 74*eda14cbcSMatt Macy static boolean_t raidz_math_initialized = B_FALSE; 75*eda14cbcSMatt Macy 76*eda14cbcSMatt Macy /* Select raidz implementation */ 77*eda14cbcSMatt Macy #define IMPL_FASTEST (UINT32_MAX) 78*eda14cbcSMatt Macy #define IMPL_CYCLE (UINT32_MAX - 1) 79*eda14cbcSMatt Macy #define IMPL_ORIGINAL (0) 80*eda14cbcSMatt Macy #define IMPL_SCALAR (1) 81*eda14cbcSMatt Macy 82*eda14cbcSMatt Macy #define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) 83*eda14cbcSMatt Macy 84*eda14cbcSMatt Macy static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; 85*eda14cbcSMatt Macy static uint32_t user_sel_impl = IMPL_FASTEST; 86*eda14cbcSMatt Macy 87*eda14cbcSMatt Macy /* Hold all supported implementations */ 88*eda14cbcSMatt Macy static size_t raidz_supp_impl_cnt = 0; 89*eda14cbcSMatt Macy static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; 90*eda14cbcSMatt Macy 91*eda14cbcSMatt Macy #if defined(_KERNEL) 92*eda14cbcSMatt Macy /* 93*eda14cbcSMatt Macy * kstats values for supported implementations 94*eda14cbcSMatt Macy * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] 95*eda14cbcSMatt Macy */ 96*eda14cbcSMatt Macy static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; 97*eda14cbcSMatt Macy 98*eda14cbcSMatt Macy /* kstat for benchmarked implementations */ 99*eda14cbcSMatt Macy static kstat_t *raidz_math_kstat = NULL; 100*eda14cbcSMatt Macy #endif 101*eda14cbcSMatt Macy 102*eda14cbcSMatt Macy /* 103*eda14cbcSMatt Macy * Returns the RAIDZ operations for raidz_map() parity calculations. When 104*eda14cbcSMatt Macy * a SIMD implementation is not allowed in the current context, then fallback 105*eda14cbcSMatt Macy * to the fastest generic implementation. 106*eda14cbcSMatt Macy */ 107*eda14cbcSMatt Macy const raidz_impl_ops_t * 108*eda14cbcSMatt Macy vdev_raidz_math_get_ops(void) 109*eda14cbcSMatt Macy { 110*eda14cbcSMatt Macy if (!kfpu_allowed()) 111*eda14cbcSMatt Macy return (&vdev_raidz_scalar_impl); 112*eda14cbcSMatt Macy 113*eda14cbcSMatt Macy raidz_impl_ops_t *ops = NULL; 114*eda14cbcSMatt Macy const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 115*eda14cbcSMatt Macy 116*eda14cbcSMatt Macy switch (impl) { 117*eda14cbcSMatt Macy case IMPL_FASTEST: 118*eda14cbcSMatt Macy ASSERT(raidz_math_initialized); 119*eda14cbcSMatt Macy ops = &vdev_raidz_fastest_impl; 120*eda14cbcSMatt Macy break; 121*eda14cbcSMatt Macy case IMPL_CYCLE: 122*eda14cbcSMatt Macy /* Cycle through all supported implementations */ 123*eda14cbcSMatt Macy ASSERT(raidz_math_initialized); 124*eda14cbcSMatt Macy ASSERT3U(raidz_supp_impl_cnt, >, 0); 125*eda14cbcSMatt Macy static size_t cycle_impl_idx = 0; 126*eda14cbcSMatt Macy size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; 127*eda14cbcSMatt Macy ops = raidz_supp_impl[idx]; 128*eda14cbcSMatt Macy break; 129*eda14cbcSMatt Macy case IMPL_ORIGINAL: 130*eda14cbcSMatt Macy ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; 131*eda14cbcSMatt Macy break; 132*eda14cbcSMatt Macy case IMPL_SCALAR: 133*eda14cbcSMatt Macy ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; 134*eda14cbcSMatt Macy break; 135*eda14cbcSMatt Macy default: 136*eda14cbcSMatt Macy ASSERT3U(impl, <, raidz_supp_impl_cnt); 137*eda14cbcSMatt Macy ASSERT3U(raidz_supp_impl_cnt, >, 0); 138*eda14cbcSMatt Macy if (impl < ARRAY_SIZE(raidz_all_maths)) 139*eda14cbcSMatt Macy ops = raidz_supp_impl[impl]; 140*eda14cbcSMatt Macy break; 141*eda14cbcSMatt Macy } 142*eda14cbcSMatt Macy 143*eda14cbcSMatt Macy ASSERT3P(ops, !=, NULL); 144*eda14cbcSMatt Macy 145*eda14cbcSMatt Macy return (ops); 146*eda14cbcSMatt Macy } 147*eda14cbcSMatt Macy 148*eda14cbcSMatt Macy /* 149*eda14cbcSMatt Macy * Select parity generation method for raidz_map 150*eda14cbcSMatt Macy */ 151*eda14cbcSMatt Macy int 152*eda14cbcSMatt Macy vdev_raidz_math_generate(raidz_map_t *rm) 153*eda14cbcSMatt Macy { 154*eda14cbcSMatt Macy raidz_gen_f gen_parity = NULL; 155*eda14cbcSMatt Macy 156*eda14cbcSMatt Macy switch (raidz_parity(rm)) { 157*eda14cbcSMatt Macy case 1: 158*eda14cbcSMatt Macy gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; 159*eda14cbcSMatt Macy break; 160*eda14cbcSMatt Macy case 2: 161*eda14cbcSMatt Macy gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; 162*eda14cbcSMatt Macy break; 163*eda14cbcSMatt Macy case 3: 164*eda14cbcSMatt Macy gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; 165*eda14cbcSMatt Macy break; 166*eda14cbcSMatt Macy default: 167*eda14cbcSMatt Macy gen_parity = NULL; 168*eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", 169*eda14cbcSMatt Macy raidz_parity(rm)); 170*eda14cbcSMatt Macy break; 171*eda14cbcSMatt Macy } 172*eda14cbcSMatt Macy 173*eda14cbcSMatt Macy /* if method is NULL execute the original implementation */ 174*eda14cbcSMatt Macy if (gen_parity == NULL) 175*eda14cbcSMatt Macy return (RAIDZ_ORIGINAL_IMPL); 176*eda14cbcSMatt Macy 177*eda14cbcSMatt Macy gen_parity(rm); 178*eda14cbcSMatt Macy 179*eda14cbcSMatt Macy return (0); 180*eda14cbcSMatt Macy } 181*eda14cbcSMatt Macy 182*eda14cbcSMatt Macy static raidz_rec_f 183*eda14cbcSMatt Macy reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, 184*eda14cbcSMatt Macy const int nbaddata) 185*eda14cbcSMatt Macy { 186*eda14cbcSMatt Macy if (nbaddata == 1 && parity_valid[CODE_P]) { 187*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_P]); 188*eda14cbcSMatt Macy } 189*eda14cbcSMatt Macy return ((raidz_rec_f) NULL); 190*eda14cbcSMatt Macy } 191*eda14cbcSMatt Macy 192*eda14cbcSMatt Macy static raidz_rec_f 193*eda14cbcSMatt Macy reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, 194*eda14cbcSMatt Macy const int nbaddata) 195*eda14cbcSMatt Macy { 196*eda14cbcSMatt Macy if (nbaddata == 1) { 197*eda14cbcSMatt Macy if (parity_valid[CODE_P]) { 198*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_P]); 199*eda14cbcSMatt Macy } else if (parity_valid[CODE_Q]) { 200*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_Q]); 201*eda14cbcSMatt Macy } 202*eda14cbcSMatt Macy } else if (nbaddata == 2 && 203*eda14cbcSMatt Macy parity_valid[CODE_P] && parity_valid[CODE_Q]) { 204*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 205*eda14cbcSMatt Macy } 206*eda14cbcSMatt Macy return ((raidz_rec_f) NULL); 207*eda14cbcSMatt Macy } 208*eda14cbcSMatt Macy 209*eda14cbcSMatt Macy static raidz_rec_f 210*eda14cbcSMatt Macy reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, 211*eda14cbcSMatt Macy const int nbaddata) 212*eda14cbcSMatt Macy { 213*eda14cbcSMatt Macy if (nbaddata == 1) { 214*eda14cbcSMatt Macy if (parity_valid[CODE_P]) { 215*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_P]); 216*eda14cbcSMatt Macy } else if (parity_valid[CODE_Q]) { 217*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_Q]); 218*eda14cbcSMatt Macy } else if (parity_valid[CODE_R]) { 219*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_R]); 220*eda14cbcSMatt Macy } 221*eda14cbcSMatt Macy } else if (nbaddata == 2) { 222*eda14cbcSMatt Macy if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { 223*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_PQ]); 224*eda14cbcSMatt Macy } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { 225*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_PR]); 226*eda14cbcSMatt Macy } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { 227*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_QR]); 228*eda14cbcSMatt Macy } 229*eda14cbcSMatt Macy } else if (nbaddata == 3 && 230*eda14cbcSMatt Macy parity_valid[CODE_P] && parity_valid[CODE_Q] && 231*eda14cbcSMatt Macy parity_valid[CODE_R]) { 232*eda14cbcSMatt Macy return (rm->rm_ops->rec[RAIDZ_REC_PQR]); 233*eda14cbcSMatt Macy } 234*eda14cbcSMatt Macy return ((raidz_rec_f) NULL); 235*eda14cbcSMatt Macy } 236*eda14cbcSMatt Macy 237*eda14cbcSMatt Macy /* 238*eda14cbcSMatt Macy * Select data reconstruction method for raidz_map 239*eda14cbcSMatt Macy * @parity_valid - Parity validity flag 240*eda14cbcSMatt Macy * @dt - Failed data index array 241*eda14cbcSMatt Macy * @nbaddata - Number of failed data columns 242*eda14cbcSMatt Macy */ 243*eda14cbcSMatt Macy int 244*eda14cbcSMatt Macy vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, 245*eda14cbcSMatt Macy const int *dt, const int nbaddata) 246*eda14cbcSMatt Macy { 247*eda14cbcSMatt Macy raidz_rec_f rec_fn = NULL; 248*eda14cbcSMatt Macy 249*eda14cbcSMatt Macy switch (raidz_parity(rm)) { 250*eda14cbcSMatt Macy case PARITY_P: 251*eda14cbcSMatt Macy rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); 252*eda14cbcSMatt Macy break; 253*eda14cbcSMatt Macy case PARITY_PQ: 254*eda14cbcSMatt Macy rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); 255*eda14cbcSMatt Macy break; 256*eda14cbcSMatt Macy case PARITY_PQR: 257*eda14cbcSMatt Macy rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); 258*eda14cbcSMatt Macy break; 259*eda14cbcSMatt Macy default: 260*eda14cbcSMatt Macy cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", 261*eda14cbcSMatt Macy raidz_parity(rm)); 262*eda14cbcSMatt Macy break; 263*eda14cbcSMatt Macy } 264*eda14cbcSMatt Macy 265*eda14cbcSMatt Macy if (rec_fn == NULL) 266*eda14cbcSMatt Macy return (RAIDZ_ORIGINAL_IMPL); 267*eda14cbcSMatt Macy else 268*eda14cbcSMatt Macy return (rec_fn(rm, dt)); 269*eda14cbcSMatt Macy } 270*eda14cbcSMatt Macy 271*eda14cbcSMatt Macy const char *raidz_gen_name[] = { 272*eda14cbcSMatt Macy "gen_p", "gen_pq", "gen_pqr" 273*eda14cbcSMatt Macy }; 274*eda14cbcSMatt Macy const char *raidz_rec_name[] = { 275*eda14cbcSMatt Macy "rec_p", "rec_q", "rec_r", 276*eda14cbcSMatt Macy "rec_pq", "rec_pr", "rec_qr", "rec_pqr" 277*eda14cbcSMatt Macy }; 278*eda14cbcSMatt Macy 279*eda14cbcSMatt Macy #if defined(_KERNEL) 280*eda14cbcSMatt Macy 281*eda14cbcSMatt Macy #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) 282*eda14cbcSMatt Macy 283*eda14cbcSMatt Macy static int 284*eda14cbcSMatt Macy raidz_math_kstat_headers(char *buf, size_t size) 285*eda14cbcSMatt Macy { 286*eda14cbcSMatt Macy int i; 287*eda14cbcSMatt Macy ssize_t off; 288*eda14cbcSMatt Macy 289*eda14cbcSMatt Macy ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); 290*eda14cbcSMatt Macy 291*eda14cbcSMatt Macy off = snprintf(buf, size, "%-17s", "implementation"); 292*eda14cbcSMatt Macy 293*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) 294*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16s", 295*eda14cbcSMatt Macy raidz_gen_name[i]); 296*eda14cbcSMatt Macy 297*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) 298*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16s", 299*eda14cbcSMatt Macy raidz_rec_name[i]); 300*eda14cbcSMatt Macy 301*eda14cbcSMatt Macy (void) snprintf(buf + off, size - off, "\n"); 302*eda14cbcSMatt Macy 303*eda14cbcSMatt Macy return (0); 304*eda14cbcSMatt Macy } 305*eda14cbcSMatt Macy 306*eda14cbcSMatt Macy static int 307*eda14cbcSMatt Macy raidz_math_kstat_data(char *buf, size_t size, void *data) 308*eda14cbcSMatt Macy { 309*eda14cbcSMatt Macy raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; 310*eda14cbcSMatt Macy raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data; 311*eda14cbcSMatt Macy ssize_t off = 0; 312*eda14cbcSMatt Macy int i; 313*eda14cbcSMatt Macy 314*eda14cbcSMatt Macy ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); 315*eda14cbcSMatt Macy 316*eda14cbcSMatt Macy if (cstat == fstat) { 317*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-17s", "fastest"); 318*eda14cbcSMatt Macy 319*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { 320*eda14cbcSMatt Macy int id = fstat->gen[i]; 321*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16s", 322*eda14cbcSMatt Macy raidz_supp_impl[id]->name); 323*eda14cbcSMatt Macy } 324*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { 325*eda14cbcSMatt Macy int id = fstat->rec[i]; 326*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16s", 327*eda14cbcSMatt Macy raidz_supp_impl[id]->name); 328*eda14cbcSMatt Macy } 329*eda14cbcSMatt Macy } else { 330*eda14cbcSMatt Macy ptrdiff_t id = cstat - raidz_impl_kstats; 331*eda14cbcSMatt Macy 332*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-17s", 333*eda14cbcSMatt Macy raidz_supp_impl[id]->name); 334*eda14cbcSMatt Macy 335*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) 336*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16llu", 337*eda14cbcSMatt Macy (u_longlong_t)cstat->gen[i]); 338*eda14cbcSMatt Macy 339*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) 340*eda14cbcSMatt Macy off += snprintf(buf + off, size - off, "%-16llu", 341*eda14cbcSMatt Macy (u_longlong_t)cstat->rec[i]); 342*eda14cbcSMatt Macy } 343*eda14cbcSMatt Macy 344*eda14cbcSMatt Macy (void) snprintf(buf + off, size - off, "\n"); 345*eda14cbcSMatt Macy 346*eda14cbcSMatt Macy return (0); 347*eda14cbcSMatt Macy } 348*eda14cbcSMatt Macy 349*eda14cbcSMatt Macy static void * 350*eda14cbcSMatt Macy raidz_math_kstat_addr(kstat_t *ksp, loff_t n) 351*eda14cbcSMatt Macy { 352*eda14cbcSMatt Macy if (n <= raidz_supp_impl_cnt) 353*eda14cbcSMatt Macy ksp->ks_private = (void *) (raidz_impl_kstats + n); 354*eda14cbcSMatt Macy else 355*eda14cbcSMatt Macy ksp->ks_private = NULL; 356*eda14cbcSMatt Macy 357*eda14cbcSMatt Macy return (ksp->ks_private); 358*eda14cbcSMatt Macy } 359*eda14cbcSMatt Macy 360*eda14cbcSMatt Macy #define BENCH_D_COLS (8ULL) 361*eda14cbcSMatt Macy #define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) 362*eda14cbcSMatt Macy #define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ 363*eda14cbcSMatt Macy #define BENCH_NS MSEC2NSEC(25) /* 25ms */ 364*eda14cbcSMatt Macy 365*eda14cbcSMatt Macy typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); 366*eda14cbcSMatt Macy 367*eda14cbcSMatt Macy static void 368*eda14cbcSMatt Macy benchmark_gen_impl(raidz_map_t *rm, const int fn) 369*eda14cbcSMatt Macy { 370*eda14cbcSMatt Macy (void) fn; 371*eda14cbcSMatt Macy vdev_raidz_generate_parity(rm); 372*eda14cbcSMatt Macy } 373*eda14cbcSMatt Macy 374*eda14cbcSMatt Macy static void 375*eda14cbcSMatt Macy benchmark_rec_impl(raidz_map_t *rm, const int fn) 376*eda14cbcSMatt Macy { 377*eda14cbcSMatt Macy static const int rec_tgt[7][3] = { 378*eda14cbcSMatt Macy {1, 2, 3}, /* rec_p: bad QR & D[0] */ 379*eda14cbcSMatt Macy {0, 2, 3}, /* rec_q: bad PR & D[0] */ 380*eda14cbcSMatt Macy {0, 1, 3}, /* rec_r: bad PQ & D[0] */ 381*eda14cbcSMatt Macy {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ 382*eda14cbcSMatt Macy {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ 383*eda14cbcSMatt Macy {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ 384*eda14cbcSMatt Macy {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ 385*eda14cbcSMatt Macy }; 386*eda14cbcSMatt Macy 387*eda14cbcSMatt Macy vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); 388*eda14cbcSMatt Macy } 389*eda14cbcSMatt Macy 390*eda14cbcSMatt Macy /* 391*eda14cbcSMatt Macy * Benchmarking of all supported implementations (raidz_supp_impl_cnt) 392*eda14cbcSMatt Macy * is performed by setting the rm_ops pointer and calling the top level 393*eda14cbcSMatt Macy * generate/reconstruct methods of bench_rm. 394*eda14cbcSMatt Macy */ 395*eda14cbcSMatt Macy static void 396*eda14cbcSMatt Macy benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) 397*eda14cbcSMatt Macy { 398*eda14cbcSMatt Macy uint64_t run_cnt, speed, best_speed = 0; 399*eda14cbcSMatt Macy hrtime_t t_start, t_diff; 400*eda14cbcSMatt Macy raidz_impl_ops_t *curr_impl; 401*eda14cbcSMatt Macy raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; 402*eda14cbcSMatt Macy int impl, i; 403*eda14cbcSMatt Macy 404*eda14cbcSMatt Macy for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { 405*eda14cbcSMatt Macy /* set an implementation to benchmark */ 406*eda14cbcSMatt Macy curr_impl = raidz_supp_impl[impl]; 407*eda14cbcSMatt Macy bench_rm->rm_ops = curr_impl; 408*eda14cbcSMatt Macy 409*eda14cbcSMatt Macy run_cnt = 0; 410*eda14cbcSMatt Macy t_start = gethrtime(); 411*eda14cbcSMatt Macy 412*eda14cbcSMatt Macy do { 413*eda14cbcSMatt Macy for (i = 0; i < 25; i++, run_cnt++) 414*eda14cbcSMatt Macy bench_fn(bench_rm, fn); 415*eda14cbcSMatt Macy 416*eda14cbcSMatt Macy t_diff = gethrtime() - t_start; 417*eda14cbcSMatt Macy } while (t_diff < BENCH_NS); 418*eda14cbcSMatt Macy 419*eda14cbcSMatt Macy speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; 420*eda14cbcSMatt Macy speed /= (t_diff * BENCH_COLS); 421*eda14cbcSMatt Macy 422*eda14cbcSMatt Macy if (bench_fn == benchmark_gen_impl) 423*eda14cbcSMatt Macy raidz_impl_kstats[impl].gen[fn] = speed; 424*eda14cbcSMatt Macy else 425*eda14cbcSMatt Macy raidz_impl_kstats[impl].rec[fn] = speed; 426*eda14cbcSMatt Macy 427*eda14cbcSMatt Macy /* Update fastest implementation method */ 428*eda14cbcSMatt Macy if (speed > best_speed) { 429*eda14cbcSMatt Macy best_speed = speed; 430*eda14cbcSMatt Macy 431*eda14cbcSMatt Macy if (bench_fn == benchmark_gen_impl) { 432*eda14cbcSMatt Macy fstat->gen[fn] = impl; 433*eda14cbcSMatt Macy vdev_raidz_fastest_impl.gen[fn] = 434*eda14cbcSMatt Macy curr_impl->gen[fn]; 435*eda14cbcSMatt Macy } else { 436*eda14cbcSMatt Macy fstat->rec[fn] = impl; 437*eda14cbcSMatt Macy vdev_raidz_fastest_impl.rec[fn] = 438*eda14cbcSMatt Macy curr_impl->rec[fn]; 439*eda14cbcSMatt Macy } 440*eda14cbcSMatt Macy } 441*eda14cbcSMatt Macy } 442*eda14cbcSMatt Macy } 443*eda14cbcSMatt Macy #endif 444*eda14cbcSMatt Macy 445*eda14cbcSMatt Macy /* 446*eda14cbcSMatt Macy * Initialize and benchmark all supported implementations. 447*eda14cbcSMatt Macy */ 448*eda14cbcSMatt Macy static void 449*eda14cbcSMatt Macy benchmark_raidz(void) 450*eda14cbcSMatt Macy { 451*eda14cbcSMatt Macy raidz_impl_ops_t *curr_impl; 452*eda14cbcSMatt Macy int i, c; 453*eda14cbcSMatt Macy 454*eda14cbcSMatt Macy /* Move supported impl into raidz_supp_impl */ 455*eda14cbcSMatt Macy for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 456*eda14cbcSMatt Macy curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; 457*eda14cbcSMatt Macy 458*eda14cbcSMatt Macy if (curr_impl->init) 459*eda14cbcSMatt Macy curr_impl->init(); 460*eda14cbcSMatt Macy 461*eda14cbcSMatt Macy if (curr_impl->is_supported()) 462*eda14cbcSMatt Macy raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; 463*eda14cbcSMatt Macy } 464*eda14cbcSMatt Macy membar_producer(); /* complete raidz_supp_impl[] init */ 465*eda14cbcSMatt Macy raidz_supp_impl_cnt = c; /* number of supported impl */ 466*eda14cbcSMatt Macy 467*eda14cbcSMatt Macy #if defined(_KERNEL) 468*eda14cbcSMatt Macy zio_t *bench_zio = NULL; 469*eda14cbcSMatt Macy raidz_map_t *bench_rm = NULL; 470*eda14cbcSMatt Macy uint64_t bench_parity; 471*eda14cbcSMatt Macy 472*eda14cbcSMatt Macy /* Fake a zio and run the benchmark on a warmed up buffer */ 473*eda14cbcSMatt Macy bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); 474*eda14cbcSMatt Macy bench_zio->io_offset = 0; 475*eda14cbcSMatt Macy bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ 476*eda14cbcSMatt Macy bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); 477*eda14cbcSMatt Macy memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); 478*eda14cbcSMatt Macy 479*eda14cbcSMatt Macy /* Benchmark parity generation methods */ 480*eda14cbcSMatt Macy for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { 481*eda14cbcSMatt Macy bench_parity = fn + 1; 482*eda14cbcSMatt Macy /* New raidz_map is needed for each generate_p/q/r */ 483*eda14cbcSMatt Macy bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 484*eda14cbcSMatt Macy BENCH_D_COLS + bench_parity, bench_parity); 485*eda14cbcSMatt Macy 486*eda14cbcSMatt Macy benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); 487*eda14cbcSMatt Macy 488*eda14cbcSMatt Macy vdev_raidz_map_free(bench_rm); 489*eda14cbcSMatt Macy } 490*eda14cbcSMatt Macy 491*eda14cbcSMatt Macy /* Benchmark data reconstruction methods */ 492*eda14cbcSMatt Macy bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, 493*eda14cbcSMatt Macy BENCH_COLS, PARITY_PQR); 494*eda14cbcSMatt Macy 495*eda14cbcSMatt Macy for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) 496*eda14cbcSMatt Macy benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); 497*eda14cbcSMatt Macy 498*eda14cbcSMatt Macy vdev_raidz_map_free(bench_rm); 499*eda14cbcSMatt Macy 500*eda14cbcSMatt Macy /* cleanup the bench zio */ 501*eda14cbcSMatt Macy abd_free(bench_zio->io_abd); 502*eda14cbcSMatt Macy kmem_free(bench_zio, sizeof (zio_t)); 503*eda14cbcSMatt Macy #else 504*eda14cbcSMatt Macy /* 505*eda14cbcSMatt Macy * Skip the benchmark in user space to avoid impacting libzpool 506*eda14cbcSMatt Macy * consumers (zdb, zhack, zinject, ztest). The last implementation 507*eda14cbcSMatt Macy * is assumed to be the fastest and used by default. 508*eda14cbcSMatt Macy */ 509*eda14cbcSMatt Macy memcpy(&vdev_raidz_fastest_impl, 510*eda14cbcSMatt Macy raidz_supp_impl[raidz_supp_impl_cnt - 1], 511*eda14cbcSMatt Macy sizeof (vdev_raidz_fastest_impl)); 512*eda14cbcSMatt Macy strcpy(vdev_raidz_fastest_impl.name, "fastest"); 513*eda14cbcSMatt Macy #endif /* _KERNEL */ 514*eda14cbcSMatt Macy } 515*eda14cbcSMatt Macy 516*eda14cbcSMatt Macy void 517*eda14cbcSMatt Macy vdev_raidz_math_init(void) 518*eda14cbcSMatt Macy { 519*eda14cbcSMatt Macy /* Determine the fastest available implementation. */ 520*eda14cbcSMatt Macy benchmark_raidz(); 521*eda14cbcSMatt Macy 522*eda14cbcSMatt Macy #if defined(_KERNEL) 523*eda14cbcSMatt Macy /* Install kstats for all implementations */ 524*eda14cbcSMatt Macy raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", 525*eda14cbcSMatt Macy KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); 526*eda14cbcSMatt Macy if (raidz_math_kstat != NULL) { 527*eda14cbcSMatt Macy raidz_math_kstat->ks_data = NULL; 528*eda14cbcSMatt Macy raidz_math_kstat->ks_ndata = UINT32_MAX; 529*eda14cbcSMatt Macy kstat_set_raw_ops(raidz_math_kstat, 530*eda14cbcSMatt Macy raidz_math_kstat_headers, 531*eda14cbcSMatt Macy raidz_math_kstat_data, 532*eda14cbcSMatt Macy raidz_math_kstat_addr); 533*eda14cbcSMatt Macy kstat_install(raidz_math_kstat); 534*eda14cbcSMatt Macy } 535*eda14cbcSMatt Macy #endif 536*eda14cbcSMatt Macy 537*eda14cbcSMatt Macy /* Finish initialization */ 538*eda14cbcSMatt Macy atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); 539*eda14cbcSMatt Macy raidz_math_initialized = B_TRUE; 540*eda14cbcSMatt Macy } 541*eda14cbcSMatt Macy 542*eda14cbcSMatt Macy void 543*eda14cbcSMatt Macy vdev_raidz_math_fini(void) 544*eda14cbcSMatt Macy { 545*eda14cbcSMatt Macy raidz_impl_ops_t const *curr_impl; 546*eda14cbcSMatt Macy 547*eda14cbcSMatt Macy #if defined(_KERNEL) 548*eda14cbcSMatt Macy if (raidz_math_kstat != NULL) { 549*eda14cbcSMatt Macy kstat_delete(raidz_math_kstat); 550*eda14cbcSMatt Macy raidz_math_kstat = NULL; 551*eda14cbcSMatt Macy } 552*eda14cbcSMatt Macy #endif 553*eda14cbcSMatt Macy 554*eda14cbcSMatt Macy for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { 555*eda14cbcSMatt Macy curr_impl = raidz_all_maths[i]; 556*eda14cbcSMatt Macy if (curr_impl->fini) 557*eda14cbcSMatt Macy curr_impl->fini(); 558*eda14cbcSMatt Macy } 559*eda14cbcSMatt Macy } 560*eda14cbcSMatt Macy 561*eda14cbcSMatt Macy static const struct { 562*eda14cbcSMatt Macy char *name; 563*eda14cbcSMatt Macy uint32_t sel; 564*eda14cbcSMatt Macy } math_impl_opts[] = { 565*eda14cbcSMatt Macy { "cycle", IMPL_CYCLE }, 566*eda14cbcSMatt Macy { "fastest", IMPL_FASTEST }, 567*eda14cbcSMatt Macy { "original", IMPL_ORIGINAL }, 568*eda14cbcSMatt Macy { "scalar", IMPL_SCALAR } 569*eda14cbcSMatt Macy }; 570*eda14cbcSMatt Macy 571*eda14cbcSMatt Macy /* 572*eda14cbcSMatt Macy * Function sets desired raidz implementation. 573*eda14cbcSMatt Macy * 574*eda14cbcSMatt Macy * If we are called before init(), user preference will be saved in 575*eda14cbcSMatt Macy * user_sel_impl, and applied in later init() call. This occurs when module 576*eda14cbcSMatt Macy * parameter is specified on module load. Otherwise, directly update 577*eda14cbcSMatt Macy * zfs_vdev_raidz_impl. 578*eda14cbcSMatt Macy * 579*eda14cbcSMatt Macy * @val Name of raidz implementation to use 580*eda14cbcSMatt Macy * @param Unused. 581*eda14cbcSMatt Macy */ 582*eda14cbcSMatt Macy int 583*eda14cbcSMatt Macy vdev_raidz_impl_set(const char *val) 584*eda14cbcSMatt Macy { 585*eda14cbcSMatt Macy int err = -EINVAL; 586*eda14cbcSMatt Macy char req_name[RAIDZ_IMPL_NAME_MAX]; 587*eda14cbcSMatt Macy uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); 588*eda14cbcSMatt Macy size_t i; 589*eda14cbcSMatt Macy 590*eda14cbcSMatt Macy /* sanitize input */ 591*eda14cbcSMatt Macy i = strnlen(val, RAIDZ_IMPL_NAME_MAX); 592*eda14cbcSMatt Macy if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) 593*eda14cbcSMatt Macy return (err); 594*eda14cbcSMatt Macy 595*eda14cbcSMatt Macy strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); 596*eda14cbcSMatt Macy while (i > 0 && !!isspace(req_name[i-1])) 597*eda14cbcSMatt Macy i--; 598*eda14cbcSMatt Macy req_name[i] = '\0'; 599*eda14cbcSMatt Macy 600*eda14cbcSMatt Macy /* Check mandatory options */ 601*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { 602*eda14cbcSMatt Macy if (strcmp(req_name, math_impl_opts[i].name) == 0) { 603*eda14cbcSMatt Macy impl = math_impl_opts[i].sel; 604*eda14cbcSMatt Macy err = 0; 605*eda14cbcSMatt Macy break; 606*eda14cbcSMatt Macy } 607*eda14cbcSMatt Macy } 608*eda14cbcSMatt Macy 609*eda14cbcSMatt Macy /* check all supported impl if init() was already called */ 610*eda14cbcSMatt Macy if (err != 0 && raidz_math_initialized) { 611*eda14cbcSMatt Macy /* check all supported implementations */ 612*eda14cbcSMatt Macy for (i = 0; i < raidz_supp_impl_cnt; i++) { 613*eda14cbcSMatt Macy if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { 614*eda14cbcSMatt Macy impl = i; 615*eda14cbcSMatt Macy err = 0; 616*eda14cbcSMatt Macy break; 617*eda14cbcSMatt Macy } 618*eda14cbcSMatt Macy } 619*eda14cbcSMatt Macy } 620*eda14cbcSMatt Macy 621*eda14cbcSMatt Macy if (err == 0) { 622*eda14cbcSMatt Macy if (raidz_math_initialized) 623*eda14cbcSMatt Macy atomic_swap_32(&zfs_vdev_raidz_impl, impl); 624*eda14cbcSMatt Macy else 625*eda14cbcSMatt Macy atomic_swap_32(&user_sel_impl, impl); 626*eda14cbcSMatt Macy } 627*eda14cbcSMatt Macy 628*eda14cbcSMatt Macy return (err); 629*eda14cbcSMatt Macy } 630*eda14cbcSMatt Macy 631*eda14cbcSMatt Macy #if defined(_KERNEL) && defined(__linux__) 632*eda14cbcSMatt Macy 633*eda14cbcSMatt Macy static int 634*eda14cbcSMatt Macy zfs_vdev_raidz_impl_set(const char *val, zfs_kernel_param_t *kp) 635*eda14cbcSMatt Macy { 636*eda14cbcSMatt Macy return (vdev_raidz_impl_set(val)); 637*eda14cbcSMatt Macy } 638*eda14cbcSMatt Macy 639*eda14cbcSMatt Macy static int 640*eda14cbcSMatt Macy zfs_vdev_raidz_impl_get(char *buffer, zfs_kernel_param_t *kp) 641*eda14cbcSMatt Macy { 642*eda14cbcSMatt Macy int i, cnt = 0; 643*eda14cbcSMatt Macy char *fmt; 644*eda14cbcSMatt Macy const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); 645*eda14cbcSMatt Macy 646*eda14cbcSMatt Macy ASSERT(raidz_math_initialized); 647*eda14cbcSMatt Macy 648*eda14cbcSMatt Macy /* list mandatory options */ 649*eda14cbcSMatt Macy for (i = 0; i < ARRAY_SIZE(math_impl_opts) - 2; i++) { 650*eda14cbcSMatt Macy fmt = (impl == math_impl_opts[i].sel) ? "[%s] " : "%s "; 651*eda14cbcSMatt Macy cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); 652*eda14cbcSMatt Macy } 653*eda14cbcSMatt Macy 654*eda14cbcSMatt Macy /* list all supported implementations */ 655*eda14cbcSMatt Macy for (i = 0; i < raidz_supp_impl_cnt; i++) { 656*eda14cbcSMatt Macy fmt = (i == impl) ? "[%s] " : "%s "; 657*eda14cbcSMatt Macy cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); 658*eda14cbcSMatt Macy } 659*eda14cbcSMatt Macy 660*eda14cbcSMatt Macy return (cnt); 661*eda14cbcSMatt Macy } 662*eda14cbcSMatt Macy 663*eda14cbcSMatt Macy module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, 664*eda14cbcSMatt Macy zfs_vdev_raidz_impl_get, NULL, 0644); 665*eda14cbcSMatt Macy MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); 666*eda14cbcSMatt Macy #endif 667