1495db6fbSLori Alt /*
2495db6fbSLori Alt * CDDL HEADER START
3495db6fbSLori Alt *
4495db6fbSLori Alt * The contents of this file are subject to the terms of the
5495db6fbSLori Alt * Common Development and Distribution License (the "License").
6495db6fbSLori Alt * You may not use this file except in compliance with the License.
7495db6fbSLori Alt *
8495db6fbSLori Alt * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9495db6fbSLori Alt * or http://www.opensolaris.org/os/licensing.
10495db6fbSLori Alt * See the License for the specific language governing permissions
11495db6fbSLori Alt * and limitations under the License.
12495db6fbSLori Alt *
13495db6fbSLori Alt * When distributing Covered Code, include this CDDL HEADER in each
14495db6fbSLori Alt * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15495db6fbSLori Alt * If applicable, add the following below this CDDL HEADER, with the
16495db6fbSLori Alt * fields enclosed by brackets "[]" replaced with your own identifying
17495db6fbSLori Alt * information: Portions Copyright [yyyy] [name of copyright owner]
18495db6fbSLori Alt *
19495db6fbSLori Alt * CDDL HEADER END
20495db6fbSLori Alt */
21495db6fbSLori Alt /*
22495db6fbSLori Alt * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23495db6fbSLori Alt * Use is subject to license terms.
24495db6fbSLori Alt */
2545818ee1SMatthew Ahrens /*
2645818ee1SMatthew Ahrens * Copyright 2013 Saso Kiselkov. All rights reserved.
27770499e1SDan Kimmel * Copyright (c) 2016 by Delphix. All rights reserved.
28*0886dcadSAndy Fiddaman * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
29*0886dcadSAndy Fiddaman * Copyright 2024 Oxide Computer Company
3045818ee1SMatthew Ahrens */
31495db6fbSLori Alt
32495db6fbSLori Alt /*
33495db6fbSLori Alt * Fletcher Checksums
34495db6fbSLori Alt * ------------------
35495db6fbSLori Alt *
36495db6fbSLori Alt * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
37495db6fbSLori Alt * recurrence relations:
38495db6fbSLori Alt *
 *	a_i = a_(i-1) + f_(i-1)
 *
 *	b_i = b_(i-1) + a_i
 *
 *	c_i = c_(i-1) + b_i		(fletcher-4 only)
 *
 *	d_i = d_(i-1) + c_i		(fletcher-4 only)
50495db6fbSLori Alt *
51495db6fbSLori Alt * Where
52495db6fbSLori Alt * a_0 = b_0 = c_0 = d_0 = 0
53495db6fbSLori Alt * and
54495db6fbSLori Alt * f_0 .. f_(n-1) are the input data.
55495db6fbSLori Alt *
56495db6fbSLori Alt * Using standard techniques, these translate into the following series:
57495db6fbSLori Alt *
 *	a_n = SUM(i = 1 .. n) f_(n-i)
 *
 *	b_n = SUM(i = 1 .. n) i * f_(n-i)
 *
 *	c_n = SUM(i = 1 .. n) (i*(i+1) / 2) * f_(n-i)
 *
 *	d_n = SUM(i = 1 .. n) (i*(i+1)*(i+2) / 6) * f_(n-i)
70495db6fbSLori Alt *
71495db6fbSLori Alt * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
72495db6fbSLori Alt * Since the additions are done mod (2^64), errors in the high bits may not
73495db6fbSLori Alt * be noticed. For this reason, fletcher-2 is deprecated.
74495db6fbSLori Alt *
75495db6fbSLori Alt * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
76495db6fbSLori Alt * A conservative estimate of how big the buffer can get before we overflow
77495db6fbSLori Alt * can be estimated using f_i = 0xffffffff for all i:
78495db6fbSLori Alt *
79495db6fbSLori Alt * % bc
80495db6fbSLori Alt * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
81495db6fbSLori Alt * 2264
82495db6fbSLori Alt * quit
83495db6fbSLori Alt * %
84495db6fbSLori Alt *
85495db6fbSLori Alt * So blocks of up to 2k will not overflow. Our largest block size is
86495db6fbSLori Alt * 128k, which has 32k 4-byte words, so we can compute the largest possible
87495db6fbSLori Alt * accumulators, then divide by 2^64 to figure the max amount of overflow:
88495db6fbSLori Alt *
89495db6fbSLori Alt * % bc
90495db6fbSLori Alt * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
91495db6fbSLori Alt * a/2^64;b/2^64;c/2^64;d/2^64
92495db6fbSLori Alt * 0
93495db6fbSLori Alt * 0
94495db6fbSLori Alt * 1365
95495db6fbSLori Alt * 11186858
96495db6fbSLori Alt * quit
97495db6fbSLori Alt * %
98495db6fbSLori Alt *
99495db6fbSLori Alt * So a and b cannot overflow. To make sure each bit of input has some
100495db6fbSLori Alt * effect on the contents of c and d, we can look at what the factors of
101495db6fbSLori Alt * the coefficients in the equations for c_n and d_n are. The number of 2s
102495db6fbSLori Alt * in the factors determines the lowest set bit in the multiplier. Running
103495db6fbSLori Alt * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
104495db6fbSLori Alt * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow
 * the 64-bit accumulators, every bit of every f_i affects every accumulator,
106495db6fbSLori Alt * even for 128k blocks.
107495db6fbSLori Alt *
108495db6fbSLori Alt * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
109495db6fbSLori Alt * we could do our calculations mod (2^32 - 1) by adding in the carries
110495db6fbSLori Alt * periodically, and store the number of carries in the top 32-bits.
111495db6fbSLori Alt *
112495db6fbSLori Alt * --------------------
113495db6fbSLori Alt * Checksum Performance
114495db6fbSLori Alt * --------------------
115495db6fbSLori Alt *
116495db6fbSLori Alt * There are two interesting components to checksum performance: cached and
117495db6fbSLori Alt * uncached performance. With cached data, fletcher-2 is about four times
118495db6fbSLori Alt * faster than fletcher-4. With uncached data, the performance difference is
119495db6fbSLori Alt * negligible, since the cost of a cache fill dominates the processing time.
120495db6fbSLori Alt * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
121495db6fbSLori Alt * efficient pass over the data.
122495db6fbSLori Alt *
123495db6fbSLori Alt * In normal operation, the data which is being checksummed is in a buffer
124495db6fbSLori Alt * which has been filled either by:
125495db6fbSLori Alt *
126495db6fbSLori Alt * 1. a compression step, which will be mostly cached, or
127495db6fbSLori Alt * 2. a bcopy() or copyin(), which will be uncached (because the
128495db6fbSLori Alt * copy is cache-bypassing).
129495db6fbSLori Alt *
130495db6fbSLori Alt * For both cached and uncached data, both fletcher checksums are much faster
131495db6fbSLori Alt * than sha-256, and slower than 'off', which doesn't touch the data at all.
132495db6fbSLori Alt */
133495db6fbSLori Alt
134495db6fbSLori Alt #include <sys/types.h>
135495db6fbSLori Alt #include <sys/sysmacros.h>
136495db6fbSLori Alt #include <sys/byteorder.h>
137*0886dcadSAndy Fiddaman #include <sys/simd.h>
138495db6fbSLori Alt #include <sys/spa.h>
139*0886dcadSAndy Fiddaman #include <sys/zio_checksum.h>
140*0886dcadSAndy Fiddaman #include <sys/zfs_context.h>
141770499e1SDan Kimmel #include <zfs_fletcher.h>
142495db6fbSLori Alt
143*0886dcadSAndy Fiddaman #define FLETCHER_MIN_SIMD_SIZE 64
144*0886dcadSAndy Fiddaman
145*0886dcadSAndy Fiddaman #ifdef _KERNEL
146*0886dcadSAndy Fiddaman
147*0886dcadSAndy Fiddaman #include <sys/atomic.h>
148*0886dcadSAndy Fiddaman #include <sys/disp.h>
149*0886dcadSAndy Fiddaman #define KPREEMPT_DISABLE kpreempt_disable()
150*0886dcadSAndy Fiddaman #define KPREEMPT_ENABLE kpreempt_enable()
151*0886dcadSAndy Fiddaman #define MEMBAR_PRODUCER membar_producer()
152*0886dcadSAndy Fiddaman
153*0886dcadSAndy Fiddaman #else /* _KERNEL */
154*0886dcadSAndy Fiddaman
155*0886dcadSAndy Fiddaman #include <atomic.h>
156*0886dcadSAndy Fiddaman #include <string.h>
157*0886dcadSAndy Fiddaman #ifndef SET_ERROR
158*0886dcadSAndy Fiddaman #define SET_ERROR(err) (err)
159*0886dcadSAndy Fiddaman #endif
160*0886dcadSAndy Fiddaman #define KPREEMPT_DISABLE
161*0886dcadSAndy Fiddaman #define KPREEMPT_ENABLE
162*0886dcadSAndy Fiddaman #define MEMBAR_PRODUCER
163*0886dcadSAndy Fiddaman
164*0886dcadSAndy Fiddaman #endif /* _KERNEL */
165*0886dcadSAndy Fiddaman
166*0886dcadSAndy Fiddaman static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx);
167*0886dcadSAndy Fiddaman static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp);
168*0886dcadSAndy Fiddaman static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx,
169*0886dcadSAndy Fiddaman const void *buf, size_t size);
170*0886dcadSAndy Fiddaman static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx,
171*0886dcadSAndy Fiddaman const void *buf, size_t size);
172*0886dcadSAndy Fiddaman static boolean_t fletcher_4_scalar_valid(void);
173*0886dcadSAndy Fiddaman
174*0886dcadSAndy Fiddaman static const fletcher_4_ops_t fletcher_4_scalar_ops = {
175*0886dcadSAndy Fiddaman .init_native = fletcher_4_scalar_init,
176*0886dcadSAndy Fiddaman .fini_native = fletcher_4_scalar_fini,
177*0886dcadSAndy Fiddaman .compute_native = fletcher_4_scalar_native,
178*0886dcadSAndy Fiddaman .init_byteswap = fletcher_4_scalar_init,
179*0886dcadSAndy Fiddaman .fini_byteswap = fletcher_4_scalar_fini,
180*0886dcadSAndy Fiddaman .compute_byteswap = fletcher_4_scalar_byteswap,
181*0886dcadSAndy Fiddaman .valid = fletcher_4_scalar_valid,
182*0886dcadSAndy Fiddaman .uses_fpu_native = B_FALSE,
183*0886dcadSAndy Fiddaman .uses_fpu_byteswap = B_FALSE,
184*0886dcadSAndy Fiddaman .name = "scalar"
185*0886dcadSAndy Fiddaman };
186*0886dcadSAndy Fiddaman
187*0886dcadSAndy Fiddaman static fletcher_4_ops_t fletcher_4_fastest_impl = {
188*0886dcadSAndy Fiddaman .name = "fastest",
189*0886dcadSAndy Fiddaman .valid = fletcher_4_scalar_valid
190*0886dcadSAndy Fiddaman };
191*0886dcadSAndy Fiddaman
192*0886dcadSAndy Fiddaman static const fletcher_4_ops_t *fletcher_4_impls[] = {
193*0886dcadSAndy Fiddaman &fletcher_4_scalar_ops,
194*0886dcadSAndy Fiddaman &fletcher_4_superscalar_ops,
195*0886dcadSAndy Fiddaman &fletcher_4_superscalar4_ops,
196*0886dcadSAndy Fiddaman #ifdef __amd64
197*0886dcadSAndy Fiddaman &fletcher_4_sse2_ops,
198*0886dcadSAndy Fiddaman &fletcher_4_ssse3_ops,
199*0886dcadSAndy Fiddaman &fletcher_4_avx2_ops,
200*0886dcadSAndy Fiddaman &fletcher_4_avx512f_ops,
201*0886dcadSAndy Fiddaman &fletcher_4_avx512bw_ops,
202*0886dcadSAndy Fiddaman #endif
203*0886dcadSAndy Fiddaman };
204*0886dcadSAndy Fiddaman
205*0886dcadSAndy Fiddaman /* Hold all supported implementations */
206*0886dcadSAndy Fiddaman static uint32_t fletcher_4_supp_impls_cnt = 0;
207*0886dcadSAndy Fiddaman static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
208*0886dcadSAndy Fiddaman
209*0886dcadSAndy Fiddaman /* Select fletcher4 implementation */
210*0886dcadSAndy Fiddaman #define IMPL_FASTEST (UINT32_MAX)
211*0886dcadSAndy Fiddaman #define IMPL_CYCLE (UINT32_MAX - 1)
212*0886dcadSAndy Fiddaman #define IMPL_SCALAR (0)
213*0886dcadSAndy Fiddaman #define IMPL_SUPERSCALAR (1)
214*0886dcadSAndy Fiddaman #define IMPL_SUPERSCALAR4 (2)
215*0886dcadSAndy Fiddaman
216*0886dcadSAndy Fiddaman static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
217*0886dcadSAndy Fiddaman
218*0886dcadSAndy Fiddaman #define IMPL_READ(i) (*(volatile uint32_t *) &(i))
219*0886dcadSAndy Fiddaman
220*0886dcadSAndy Fiddaman static struct fletcher_4_impl_selector {
221*0886dcadSAndy Fiddaman const char *fis_name;
222*0886dcadSAndy Fiddaman uint32_t fis_sel;
223*0886dcadSAndy Fiddaman } fletcher_4_impl_selectors[] = {
224*0886dcadSAndy Fiddaman { "cycle", IMPL_CYCLE },
225*0886dcadSAndy Fiddaman { "fastest", IMPL_FASTEST },
226*0886dcadSAndy Fiddaman { "scalar", IMPL_SCALAR }
227*0886dcadSAndy Fiddaman };
228*0886dcadSAndy Fiddaman
229*0886dcadSAndy Fiddaman #if defined(_KERNEL)
230*0886dcadSAndy Fiddaman static kstat_t *fletcher_4_kstat;
231*0886dcadSAndy Fiddaman static kstat_named_t fletcher_4_kstat_data[ARRAY_SIZE(fletcher_4_impls) * 2];
232*0886dcadSAndy Fiddaman
233*0886dcadSAndy Fiddaman static struct fletcher_4_bench {
234*0886dcadSAndy Fiddaman uint64_t native;
235*0886dcadSAndy Fiddaman uint64_t byteswap;
236*0886dcadSAndy Fiddaman } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
237*0886dcadSAndy Fiddaman #endif
238*0886dcadSAndy Fiddaman
239*0886dcadSAndy Fiddaman /* Indicate that benchmark has been completed */
240*0886dcadSAndy Fiddaman static boolean_t fletcher_4_initialized = B_FALSE;
241*0886dcadSAndy Fiddaman
242495db6fbSLori Alt void
fletcher_init(zio_cksum_t * zcp)243770499e1SDan Kimmel fletcher_init(zio_cksum_t *zcp)
244495db6fbSLori Alt {
245770499e1SDan Kimmel ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
246770499e1SDan Kimmel }
247770499e1SDan Kimmel
248770499e1SDan Kimmel int
fletcher_2_incremental_native(void * buf,size_t size,void * data)249770499e1SDan Kimmel fletcher_2_incremental_native(void *buf, size_t size, void *data)
250770499e1SDan Kimmel {
251770499e1SDan Kimmel zio_cksum_t *zcp = data;
252770499e1SDan Kimmel
253495db6fbSLori Alt const uint64_t *ip = buf;
254495db6fbSLori Alt const uint64_t *ipend = ip + (size / sizeof (uint64_t));
255495db6fbSLori Alt uint64_t a0, b0, a1, b1;
256495db6fbSLori Alt
257770499e1SDan Kimmel a0 = zcp->zc_word[0];
258770499e1SDan Kimmel a1 = zcp->zc_word[1];
259770499e1SDan Kimmel b0 = zcp->zc_word[2];
260770499e1SDan Kimmel b1 = zcp->zc_word[3];
261770499e1SDan Kimmel
262770499e1SDan Kimmel for (; ip < ipend; ip += 2) {
263495db6fbSLori Alt a0 += ip[0];
264495db6fbSLori Alt a1 += ip[1];
265495db6fbSLori Alt b0 += a0;
266495db6fbSLori Alt b1 += a1;
267495db6fbSLori Alt }
268495db6fbSLori Alt
269495db6fbSLori Alt ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
270770499e1SDan Kimmel return (0);
271495db6fbSLori Alt }
272495db6fbSLori Alt
273495db6fbSLori Alt void
fletcher_2_native(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)274770499e1SDan Kimmel fletcher_2_native(const void *buf, size_t size,
275*0886dcadSAndy Fiddaman const void *ctx_template __unused, zio_cksum_t *zcp)
276495db6fbSLori Alt {
277770499e1SDan Kimmel fletcher_init(zcp);
278770499e1SDan Kimmel (void) fletcher_2_incremental_native((void *) buf, size, zcp);
279770499e1SDan Kimmel }
280770499e1SDan Kimmel
281770499e1SDan Kimmel int
fletcher_2_incremental_byteswap(void * buf,size_t size,void * data)282770499e1SDan Kimmel fletcher_2_incremental_byteswap(void *buf, size_t size, void *data)
283770499e1SDan Kimmel {
284770499e1SDan Kimmel zio_cksum_t *zcp = data;
285770499e1SDan Kimmel
286495db6fbSLori Alt const uint64_t *ip = buf;
287495db6fbSLori Alt const uint64_t *ipend = ip + (size / sizeof (uint64_t));
288495db6fbSLori Alt uint64_t a0, b0, a1, b1;
289495db6fbSLori Alt
290770499e1SDan Kimmel a0 = zcp->zc_word[0];
291770499e1SDan Kimmel a1 = zcp->zc_word[1];
292770499e1SDan Kimmel b0 = zcp->zc_word[2];
293770499e1SDan Kimmel b1 = zcp->zc_word[3];
294770499e1SDan Kimmel
295770499e1SDan Kimmel for (; ip < ipend; ip += 2) {
296495db6fbSLori Alt a0 += BSWAP_64(ip[0]);
297495db6fbSLori Alt a1 += BSWAP_64(ip[1]);
298495db6fbSLori Alt b0 += a0;
299495db6fbSLori Alt b1 += a1;
300495db6fbSLori Alt }
301495db6fbSLori Alt
302495db6fbSLori Alt ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
303770499e1SDan Kimmel return (0);
304495db6fbSLori Alt }
305495db6fbSLori Alt
306495db6fbSLori Alt void
fletcher_2_byteswap(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)307770499e1SDan Kimmel fletcher_2_byteswap(const void *buf, size_t size,
308*0886dcadSAndy Fiddaman const void *ctx_template __unused, zio_cksum_t *zcp)
309495db6fbSLori Alt {
310770499e1SDan Kimmel fletcher_init(zcp);
311770499e1SDan Kimmel (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
312495db6fbSLori Alt }
313495db6fbSLori Alt
314*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_init(fletcher_4_ctx_t * ctx)315*0886dcadSAndy Fiddaman fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
3164ee0199eSRobert Mustacchi {
317*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0);
318*0886dcadSAndy Fiddaman }
3194ee0199eSRobert Mustacchi
320*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_fini(fletcher_4_ctx_t * ctx,zio_cksum_t * zcp)321*0886dcadSAndy Fiddaman fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
322*0886dcadSAndy Fiddaman {
323*0886dcadSAndy Fiddaman memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t));
324*0886dcadSAndy Fiddaman }
325*0886dcadSAndy Fiddaman
326*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_native(fletcher_4_ctx_t * ctx,const void * buf,size_t size)327*0886dcadSAndy Fiddaman fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
328*0886dcadSAndy Fiddaman {
329495db6fbSLori Alt const uint32_t *ip = buf;
330495db6fbSLori Alt const uint32_t *ipend = ip + (size / sizeof (uint32_t));
331495db6fbSLori Alt uint64_t a, b, c, d;
332495db6fbSLori Alt
333*0886dcadSAndy Fiddaman a = ctx->scalar.zc_word[0];
334*0886dcadSAndy Fiddaman b = ctx->scalar.zc_word[1];
335*0886dcadSAndy Fiddaman c = ctx->scalar.zc_word[2];
336*0886dcadSAndy Fiddaman d = ctx->scalar.zc_word[3];
337495db6fbSLori Alt
338495db6fbSLori Alt for (; ip < ipend; ip++) {
339495db6fbSLori Alt a += ip[0];
340495db6fbSLori Alt b += a;
341495db6fbSLori Alt c += b;
342495db6fbSLori Alt d += c;
343495db6fbSLori Alt }
344495db6fbSLori Alt
345*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
346495db6fbSLori Alt }
347495db6fbSLori Alt
348*0886dcadSAndy Fiddaman static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t * ctx,const void * buf,size_t size)349*0886dcadSAndy Fiddaman fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, size_t size)
350495db6fbSLori Alt {
351495db6fbSLori Alt const uint32_t *ip = buf;
352495db6fbSLori Alt const uint32_t *ipend = ip + (size / sizeof (uint32_t));
353495db6fbSLori Alt uint64_t a, b, c, d;
354495db6fbSLori Alt
355*0886dcadSAndy Fiddaman a = ctx->scalar.zc_word[0];
356*0886dcadSAndy Fiddaman b = ctx->scalar.zc_word[1];
357*0886dcadSAndy Fiddaman c = ctx->scalar.zc_word[2];
358*0886dcadSAndy Fiddaman d = ctx->scalar.zc_word[3];
359495db6fbSLori Alt
360495db6fbSLori Alt for (; ip < ipend; ip++) {
361495db6fbSLori Alt a += BSWAP_32(ip[0]);
362495db6fbSLori Alt b += a;
363495db6fbSLori Alt c += b;
364495db6fbSLori Alt d += c;
365495db6fbSLori Alt }
366495db6fbSLori Alt
367*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d);
368*0886dcadSAndy Fiddaman }
369*0886dcadSAndy Fiddaman
370*0886dcadSAndy Fiddaman static boolean_t
fletcher_4_scalar_valid(void)371*0886dcadSAndy Fiddaman fletcher_4_scalar_valid(void)
372*0886dcadSAndy Fiddaman {
373*0886dcadSAndy Fiddaman return (B_TRUE);
374*0886dcadSAndy Fiddaman }
375*0886dcadSAndy Fiddaman
376*0886dcadSAndy Fiddaman int
fletcher_4_impl_set(const char * val)377*0886dcadSAndy Fiddaman fletcher_4_impl_set(const char *val)
378*0886dcadSAndy Fiddaman {
379*0886dcadSAndy Fiddaman int err = EINVAL;
380*0886dcadSAndy Fiddaman uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
381*0886dcadSAndy Fiddaman size_t i;
382*0886dcadSAndy Fiddaman
383*0886dcadSAndy Fiddaman /* check mandatory implementations */
384*0886dcadSAndy Fiddaman for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
385*0886dcadSAndy Fiddaman const char *name = fletcher_4_impl_selectors[i].fis_name;
386*0886dcadSAndy Fiddaman
387*0886dcadSAndy Fiddaman if (strcmp(val, name) == 0) {
388*0886dcadSAndy Fiddaman impl = fletcher_4_impl_selectors[i].fis_sel;
389*0886dcadSAndy Fiddaman err = 0;
390*0886dcadSAndy Fiddaman break;
391*0886dcadSAndy Fiddaman }
392*0886dcadSAndy Fiddaman }
393*0886dcadSAndy Fiddaman
394*0886dcadSAndy Fiddaman if (err != 0 && fletcher_4_initialized) {
395*0886dcadSAndy Fiddaman /* check all supported implementations */
396*0886dcadSAndy Fiddaman for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
397*0886dcadSAndy Fiddaman const char *name = fletcher_4_supp_impls[i]->name;
398*0886dcadSAndy Fiddaman
399*0886dcadSAndy Fiddaman if (strcmp(val, name) == 0) {
400*0886dcadSAndy Fiddaman impl = i;
401*0886dcadSAndy Fiddaman err = 0;
402*0886dcadSAndy Fiddaman break;
403*0886dcadSAndy Fiddaman }
404*0886dcadSAndy Fiddaman }
405*0886dcadSAndy Fiddaman }
406*0886dcadSAndy Fiddaman
407*0886dcadSAndy Fiddaman if (err == 0) {
408*0886dcadSAndy Fiddaman atomic_swap_32(&fletcher_4_impl_chosen, impl);
409*0886dcadSAndy Fiddaman MEMBAR_PRODUCER;
410*0886dcadSAndy Fiddaman }
411*0886dcadSAndy Fiddaman
412*0886dcadSAndy Fiddaman return (SET_ERROR(err));
413*0886dcadSAndy Fiddaman }
414*0886dcadSAndy Fiddaman
415*0886dcadSAndy Fiddaman /*
416*0886dcadSAndy Fiddaman * Returns the Fletcher 4 operations for checksums. When a SIMD
417*0886dcadSAndy Fiddaman * implementation is not allowed in the current context, then fallback
418*0886dcadSAndy Fiddaman * to the fastest generic implementation.
419*0886dcadSAndy Fiddaman */
420*0886dcadSAndy Fiddaman static inline const fletcher_4_ops_t *
fletcher_4_impl_get(void)421*0886dcadSAndy Fiddaman fletcher_4_impl_get(void)
422*0886dcadSAndy Fiddaman {
423*0886dcadSAndy Fiddaman if (!kfpu_allowed())
424*0886dcadSAndy Fiddaman return (&fletcher_4_superscalar4_ops);
425*0886dcadSAndy Fiddaman
426*0886dcadSAndy Fiddaman const fletcher_4_ops_t *ops = NULL;
427*0886dcadSAndy Fiddaman uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
428*0886dcadSAndy Fiddaman
429*0886dcadSAndy Fiddaman switch (impl) {
430*0886dcadSAndy Fiddaman case IMPL_FASTEST:
431*0886dcadSAndy Fiddaman ASSERT(fletcher_4_initialized);
432*0886dcadSAndy Fiddaman ops = &fletcher_4_fastest_impl;
433*0886dcadSAndy Fiddaman break;
434*0886dcadSAndy Fiddaman case IMPL_CYCLE:
435*0886dcadSAndy Fiddaman /* Cycle through supported implementations */
436*0886dcadSAndy Fiddaman ASSERT(fletcher_4_initialized);
437*0886dcadSAndy Fiddaman ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
438*0886dcadSAndy Fiddaman
439*0886dcadSAndy Fiddaman static uint32_t cycle_count = 0;
440*0886dcadSAndy Fiddaman uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
441*0886dcadSAndy Fiddaman
442*0886dcadSAndy Fiddaman ops = fletcher_4_supp_impls[idx];
443*0886dcadSAndy Fiddaman break;
444*0886dcadSAndy Fiddaman default:
445*0886dcadSAndy Fiddaman ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
446*0886dcadSAndy Fiddaman ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
447*0886dcadSAndy Fiddaman
448*0886dcadSAndy Fiddaman ops = fletcher_4_supp_impls[impl];
449*0886dcadSAndy Fiddaman break;
450*0886dcadSAndy Fiddaman }
451*0886dcadSAndy Fiddaman
452*0886dcadSAndy Fiddaman ASSERT3P(ops, !=, NULL);
453*0886dcadSAndy Fiddaman
454*0886dcadSAndy Fiddaman return (ops);
455*0886dcadSAndy Fiddaman }
456*0886dcadSAndy Fiddaman
457*0886dcadSAndy Fiddaman static inline void
fletcher_4_native_impl(const void * buf,size_t size,zio_cksum_t * zcp)458*0886dcadSAndy Fiddaman fletcher_4_native_impl(const void *buf, size_t size, zio_cksum_t *zcp)
459*0886dcadSAndy Fiddaman {
460*0886dcadSAndy Fiddaman fletcher_4_ctx_t ctx;
461*0886dcadSAndy Fiddaman const fletcher_4_ops_t *ops = fletcher_4_impl_get();
462*0886dcadSAndy Fiddaman
463*0886dcadSAndy Fiddaman if (ops->uses_fpu_native)
464*0886dcadSAndy Fiddaman kfpu_begin();
465*0886dcadSAndy Fiddaman ops->init_native(&ctx);
466*0886dcadSAndy Fiddaman ops->compute_native(&ctx, buf, size);
467*0886dcadSAndy Fiddaman ops->fini_native(&ctx, zcp);
468*0886dcadSAndy Fiddaman if (ops->uses_fpu_native)
469*0886dcadSAndy Fiddaman kfpu_end();
470*0886dcadSAndy Fiddaman }
471*0886dcadSAndy Fiddaman
472*0886dcadSAndy Fiddaman void
fletcher_4_native(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)473*0886dcadSAndy Fiddaman fletcher_4_native(const void *buf, size_t size,
474*0886dcadSAndy Fiddaman const void *ctx_template __unused, zio_cksum_t *zcp)
475*0886dcadSAndy Fiddaman {
476*0886dcadSAndy Fiddaman const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
477*0886dcadSAndy Fiddaman
478*0886dcadSAndy Fiddaman ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
479*0886dcadSAndy Fiddaman ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
480*0886dcadSAndy Fiddaman
481*0886dcadSAndy Fiddaman if (size == 0 || p2size == 0) {
482*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
483*0886dcadSAndy Fiddaman
484*0886dcadSAndy Fiddaman if (size > 0) {
485*0886dcadSAndy Fiddaman fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
486*0886dcadSAndy Fiddaman buf, size);
487*0886dcadSAndy Fiddaman }
488*0886dcadSAndy Fiddaman } else {
489*0886dcadSAndy Fiddaman fletcher_4_native_impl(buf, p2size, zcp);
490*0886dcadSAndy Fiddaman
491*0886dcadSAndy Fiddaman if (p2size < size) {
492*0886dcadSAndy Fiddaman fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp,
493*0886dcadSAndy Fiddaman (char *)buf + p2size, size - p2size);
494*0886dcadSAndy Fiddaman }
495*0886dcadSAndy Fiddaman }
496*0886dcadSAndy Fiddaman }
497*0886dcadSAndy Fiddaman
498*0886dcadSAndy Fiddaman void
fletcher_4_native_varsize(const void * buf,size_t size,zio_cksum_t * zcp)499*0886dcadSAndy Fiddaman fletcher_4_native_varsize(const void *buf, size_t size, zio_cksum_t *zcp)
500*0886dcadSAndy Fiddaman {
501*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
502*0886dcadSAndy Fiddaman fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
503*0886dcadSAndy Fiddaman }
504*0886dcadSAndy Fiddaman
505*0886dcadSAndy Fiddaman static inline void
fletcher_4_byteswap_impl(const void * buf,size_t size,zio_cksum_t * zcp)506*0886dcadSAndy Fiddaman fletcher_4_byteswap_impl(const void *buf, size_t size, zio_cksum_t *zcp)
507*0886dcadSAndy Fiddaman {
508*0886dcadSAndy Fiddaman fletcher_4_ctx_t ctx;
509*0886dcadSAndy Fiddaman const fletcher_4_ops_t *ops = fletcher_4_impl_get();
510*0886dcadSAndy Fiddaman
511*0886dcadSAndy Fiddaman if (ops->uses_fpu_byteswap)
512*0886dcadSAndy Fiddaman kfpu_begin();
513*0886dcadSAndy Fiddaman ops->init_byteswap(&ctx);
514*0886dcadSAndy Fiddaman ops->compute_byteswap(&ctx, buf, size);
515*0886dcadSAndy Fiddaman ops->fini_byteswap(&ctx, zcp);
516*0886dcadSAndy Fiddaman if (ops->uses_fpu_byteswap)
517*0886dcadSAndy Fiddaman kfpu_end();
518*0886dcadSAndy Fiddaman }
519*0886dcadSAndy Fiddaman
520*0886dcadSAndy Fiddaman void
fletcher_4_byteswap(const void * buf,size_t size,const void * ctx_template __unused,zio_cksum_t * zcp)521*0886dcadSAndy Fiddaman fletcher_4_byteswap(const void *buf, size_t size,
522*0886dcadSAndy Fiddaman const void *ctx_template __unused, zio_cksum_t *zcp)
523*0886dcadSAndy Fiddaman {
524*0886dcadSAndy Fiddaman const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
525*0886dcadSAndy Fiddaman
526*0886dcadSAndy Fiddaman ASSERT(IS_P2ALIGNED(buf, sizeof (uint32_t)));
527*0886dcadSAndy Fiddaman ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
528*0886dcadSAndy Fiddaman
529*0886dcadSAndy Fiddaman if (size == 0 || p2size == 0) {
530*0886dcadSAndy Fiddaman ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
531*0886dcadSAndy Fiddaman
532*0886dcadSAndy Fiddaman if (size > 0) {
533*0886dcadSAndy Fiddaman fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
534*0886dcadSAndy Fiddaman buf, size);
535*0886dcadSAndy Fiddaman }
536*0886dcadSAndy Fiddaman } else {
537*0886dcadSAndy Fiddaman fletcher_4_byteswap_impl(buf, p2size, zcp);
538*0886dcadSAndy Fiddaman
539*0886dcadSAndy Fiddaman if (p2size < size) {
540*0886dcadSAndy Fiddaman fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp,
541*0886dcadSAndy Fiddaman (char *)buf + p2size, size - p2size);
542*0886dcadSAndy Fiddaman }
543*0886dcadSAndy Fiddaman }
544*0886dcadSAndy Fiddaman }
545*0886dcadSAndy Fiddaman
546*0886dcadSAndy Fiddaman /* Incremental Fletcher 4 */
547*0886dcadSAndy Fiddaman
548*0886dcadSAndy Fiddaman #define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20)
549*0886dcadSAndy Fiddaman
550*0886dcadSAndy Fiddaman static inline void
fletcher_4_incremental_combine(zio_cksum_t * zcp,const size_t size,const zio_cksum_t * nzcp)551*0886dcadSAndy Fiddaman fletcher_4_incremental_combine(zio_cksum_t *zcp, const size_t size,
552*0886dcadSAndy Fiddaman const zio_cksum_t *nzcp)
553*0886dcadSAndy Fiddaman {
554*0886dcadSAndy Fiddaman const uint64_t c1 = size / sizeof (uint32_t);
555*0886dcadSAndy Fiddaman const uint64_t c2 = c1 * (c1 + 1) / 2;
556*0886dcadSAndy Fiddaman const uint64_t c3 = c2 * (c1 + 2) / 3;
557*0886dcadSAndy Fiddaman
558*0886dcadSAndy Fiddaman /*
559*0886dcadSAndy Fiddaman * Value of 'c3' overflows on buffer sizes close to 16MiB. For that
560*0886dcadSAndy Fiddaman * reason we split incremental fletcher4 computation of large buffers
561*0886dcadSAndy Fiddaman * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size.
562*0886dcadSAndy Fiddaman */
563*0886dcadSAndy Fiddaman ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE);
564*0886dcadSAndy Fiddaman
565*0886dcadSAndy Fiddaman zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] +
566*0886dcadSAndy Fiddaman c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0];
567*0886dcadSAndy Fiddaman zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] +
568*0886dcadSAndy Fiddaman c2 * zcp->zc_word[0];
569*0886dcadSAndy Fiddaman zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0];
570*0886dcadSAndy Fiddaman zcp->zc_word[0] += nzcp->zc_word[0];
571*0886dcadSAndy Fiddaman }
572*0886dcadSAndy Fiddaman
573*0886dcadSAndy Fiddaman static inline void
fletcher_4_incremental_impl(boolean_t native,const void * buf,size_t size,zio_cksum_t * zcp)574*0886dcadSAndy Fiddaman fletcher_4_incremental_impl(boolean_t native, const void *buf, size_t size,
575*0886dcadSAndy Fiddaman zio_cksum_t *zcp)
576*0886dcadSAndy Fiddaman {
577*0886dcadSAndy Fiddaman while (size > 0) {
578*0886dcadSAndy Fiddaman zio_cksum_t nzc;
579*0886dcadSAndy Fiddaman uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE);
580*0886dcadSAndy Fiddaman
581*0886dcadSAndy Fiddaman if (native)
582*0886dcadSAndy Fiddaman fletcher_4_native(buf, len, NULL, &nzc);
583*0886dcadSAndy Fiddaman else
584*0886dcadSAndy Fiddaman fletcher_4_byteswap(buf, len, NULL, &nzc);
585*0886dcadSAndy Fiddaman
586*0886dcadSAndy Fiddaman fletcher_4_incremental_combine(zcp, len, &nzc);
587*0886dcadSAndy Fiddaman
588*0886dcadSAndy Fiddaman size -= len;
589*0886dcadSAndy Fiddaman buf += len;
590*0886dcadSAndy Fiddaman }
591*0886dcadSAndy Fiddaman }
592*0886dcadSAndy Fiddaman
593*0886dcadSAndy Fiddaman int
fletcher_4_incremental_native(void * buf,size_t size,void * data)594*0886dcadSAndy Fiddaman fletcher_4_incremental_native(void *buf, size_t size, void *data)
595*0886dcadSAndy Fiddaman {
596*0886dcadSAndy Fiddaman zio_cksum_t *zcp = data;
597*0886dcadSAndy Fiddaman
598*0886dcadSAndy Fiddaman /* Use scalar impl to directly update cksum of small blocks */
599*0886dcadSAndy Fiddaman if (size < SPA_MINBLOCKSIZE)
600*0886dcadSAndy Fiddaman fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size);
601*0886dcadSAndy Fiddaman else
602*0886dcadSAndy Fiddaman fletcher_4_incremental_impl(B_TRUE, buf, size, zcp);
603770499e1SDan Kimmel return (0);
604770499e1SDan Kimmel }
605770499e1SDan Kimmel
606*0886dcadSAndy Fiddaman int
fletcher_4_incremental_byteswap(void * buf,size_t size,void * data)607*0886dcadSAndy Fiddaman fletcher_4_incremental_byteswap(void *buf, size_t size, void *data)
608770499e1SDan Kimmel {
609*0886dcadSAndy Fiddaman zio_cksum_t *zcp = data;
610*0886dcadSAndy Fiddaman
611*0886dcadSAndy Fiddaman /* Use scalar impl to directly update cksum of small blocks */
612*0886dcadSAndy Fiddaman if (size < SPA_MINBLOCKSIZE)
613*0886dcadSAndy Fiddaman fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size);
614*0886dcadSAndy Fiddaman else
615*0886dcadSAndy Fiddaman fletcher_4_incremental_impl(B_FALSE, buf, size, zcp);
616*0886dcadSAndy Fiddaman return (0);
617495db6fbSLori Alt }
618*0886dcadSAndy Fiddaman
/*
 * Copy the init/fini/compute function pointers and the FPU flag for one
 * byte order ('type' is native or byteswap) from the ops vector 'src'
 * into fletcher_4_fastest_impl.
 *
 * 'src' is parenthesized so that any pointer-valued expression can be
 * passed safely.  The expansion is a bare brace block, so it is valid
 * with or without a trailing semicolon at the call site.
 */
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
{									  \
	fletcher_4_fastest_impl.init_ ## type = (src)->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = (src)->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = (src)->compute_ ## type; \
	fletcher_4_fastest_impl.uses_fpu_ ## type = (src)->uses_fpu_ ## type; \
}
626*0886dcadSAndy Fiddaman
627*0886dcadSAndy Fiddaman #define FLETCHER_4_BENCH_NS (MSEC2NSEC(1)) /* 1ms */
628*0886dcadSAndy Fiddaman
629*0886dcadSAndy Fiddaman typedef void fletcher_checksum_func_t(const void *, size_t, const void *,
630*0886dcadSAndy Fiddaman zio_cksum_t *);
631*0886dcadSAndy Fiddaman
#if defined(_KERNEL)
/*
 * Time every supported implementation over the `data_size` bytes at `data`
 * for one byte order (`native` selects fletcher_4_native, otherwise
 * fletcher_4_byteswap).
 *
 * For each implementation the measured bandwidth in bytes per second is
 * stored in its fletcher_4_stat_data[] slot.  The index of the fastest one
 * is stored in the extra slot at fletcher_4_stat_data[fletcher_4_supp_impls_cnt]
 * and its function pointers for this byte order are copied into
 * fletcher_4_fastest_impl.  The implementation selection that was in effect
 * on entry is restored before returning.
 */
static void
fletcher_4_benchmark_impl(boolean_t native, char *data, size_t data_size)
{
	struct fletcher_4_bench *winner =
	    &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
	const uint32_t saved_sel = IMPL_READ(fletcher_4_impl_chosen);
	fletcher_checksum_func_t *cksum_fn;
	uint64_t best_bw = 0;
	zio_cksum_t zc;

	if (native)
		cksum_fn = fletcher_4_native;
	else
		cksum_fn = fletcher_4_byteswap;

	for (uint32_t idx = 0; idx < fletcher_4_supp_impls_cnt; idx++) {
		struct fletcher_4_bench *cur = &fletcher_4_stat_data[idx];
		uint64_t iterations = 0;
		uint64_t elapsed_ns, bw;
		hrtime_t t0;

		/* Force the implementation under test for the timed calls */
		fletcher_4_impl_chosen = idx;

		KPREEMPT_DISABLE;
		t0 = gethrtime();
		do {
			/* Batch 32 calls between clock reads */
			for (uint32_t rep = 0; rep < 32; rep++) {
				cksum_fn(data, data_size, NULL, &zc);
				iterations++;
			}

			elapsed_ns = gethrtime() - t0;
		} while (elapsed_ns < FLETCHER_4_BENCH_NS);
		KPREEMPT_ENABLE;

		/* bytes processed per second */
		bw = (data_size * iterations * NANOSEC) / elapsed_ns;

		if (native)
			cur->native = bw;
		else
			cur->byteswap = bw;

		if (bw <= best_bw)
			continue;

		/* New leader: remember its index and adopt its functions */
		best_bw = bw;
		if (native) {
			winner->native = idx;
			FLETCHER_4_FASTEST_FN_COPY(native,
			    fletcher_4_supp_impls[idx]);
		} else {
			winner->byteswap = idx;
			FLETCHER_4_FASTEST_FN_COPY(byteswap,
			    fletcher_4_supp_impls[idx]);
		}
	}

	/* Put back whatever selection was active before the benchmark */
	atomic_swap_32(&fletcher_4_impl_chosen, saved_sel);
}
#endif /* _KERNEL */
690*0886dcadSAndy Fiddaman
691*0886dcadSAndy Fiddaman /*
692*0886dcadSAndy Fiddaman * Initialize and benchmark all supported implementations.
693*0886dcadSAndy Fiddaman */
694*0886dcadSAndy Fiddaman static void
fletcher_4_benchmark(void)695*0886dcadSAndy Fiddaman fletcher_4_benchmark(void)
696*0886dcadSAndy Fiddaman {
697*0886dcadSAndy Fiddaman fletcher_4_ops_t *curr_impl;
698*0886dcadSAndy Fiddaman int i, c;
699*0886dcadSAndy Fiddaman
700*0886dcadSAndy Fiddaman /* Move supported implementations into fletcher_4_supp_impls */
701*0886dcadSAndy Fiddaman for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
702*0886dcadSAndy Fiddaman curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
703*0886dcadSAndy Fiddaman
704*0886dcadSAndy Fiddaman if (curr_impl->valid && curr_impl->valid())
705*0886dcadSAndy Fiddaman fletcher_4_supp_impls[c++] = curr_impl;
706*0886dcadSAndy Fiddaman }
707*0886dcadSAndy Fiddaman MEMBAR_PRODUCER; /* complete fletcher_4_supp_impls[] init */
708*0886dcadSAndy Fiddaman fletcher_4_supp_impls_cnt = c; /* number of supported impl */
709*0886dcadSAndy Fiddaman
710*0886dcadSAndy Fiddaman #if defined(_KERNEL)
711*0886dcadSAndy Fiddaman static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
712*0886dcadSAndy Fiddaman char *databuf = kmem_alloc(data_size, KM_SLEEP);
713*0886dcadSAndy Fiddaman
714*0886dcadSAndy Fiddaman for (i = 0; i < data_size / sizeof (uint64_t); i++)
715*0886dcadSAndy Fiddaman ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
716*0886dcadSAndy Fiddaman
717*0886dcadSAndy Fiddaman fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
718*0886dcadSAndy Fiddaman fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
719*0886dcadSAndy Fiddaman
720*0886dcadSAndy Fiddaman kmem_free(databuf, data_size);
721*0886dcadSAndy Fiddaman #else
722*0886dcadSAndy Fiddaman /*
723*0886dcadSAndy Fiddaman * Skip the benchmark in user space to avoid impacting libzpool
724*0886dcadSAndy Fiddaman * consumers (zdb, zhack, zinject, ztest). The last implementation
725*0886dcadSAndy Fiddaman * is assumed to be the fastest and used by default.
726*0886dcadSAndy Fiddaman */
727*0886dcadSAndy Fiddaman memcpy(&fletcher_4_fastest_impl,
728*0886dcadSAndy Fiddaman fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
729*0886dcadSAndy Fiddaman sizeof (fletcher_4_fastest_impl));
730*0886dcadSAndy Fiddaman fletcher_4_fastest_impl.name = "fastest";
731*0886dcadSAndy Fiddaman #endif /* _KERNEL */
732*0886dcadSAndy Fiddaman }
733*0886dcadSAndy Fiddaman
734*0886dcadSAndy Fiddaman void
fletcher_4_init(void)735*0886dcadSAndy Fiddaman fletcher_4_init(void)
736*0886dcadSAndy Fiddaman {
737*0886dcadSAndy Fiddaman /* Determine the fastest available implementation. */
738*0886dcadSAndy Fiddaman fletcher_4_benchmark();
739*0886dcadSAndy Fiddaman
740*0886dcadSAndy Fiddaman #if defined(_KERNEL)
741*0886dcadSAndy Fiddaman /* install kstats for all implementations */
742*0886dcadSAndy Fiddaman for (uint32_t i = 0; i < fletcher_4_supp_impls_cnt; i++) {
743*0886dcadSAndy Fiddaman struct fletcher_4_bench *stat = &fletcher_4_stat_data[i];
744*0886dcadSAndy Fiddaman const fletcher_4_ops_t *ops = fletcher_4_supp_impls[i];
745*0886dcadSAndy Fiddaman kstat_named_t *kstat_native = &fletcher_4_kstat_data[i * 2];
746*0886dcadSAndy Fiddaman kstat_named_t *kstat_byteswap =
747*0886dcadSAndy Fiddaman &fletcher_4_kstat_data[i * 2 + 1];
748*0886dcadSAndy Fiddaman
749*0886dcadSAndy Fiddaman (void) snprintf(kstat_native->name,
750*0886dcadSAndy Fiddaman sizeof (kstat_native->name), "%s_native", ops->name);
751*0886dcadSAndy Fiddaman kstat_native->data_type = KSTAT_DATA_UINT64;
752*0886dcadSAndy Fiddaman kstat_native->value.ui64 = stat->native;
753*0886dcadSAndy Fiddaman
754*0886dcadSAndy Fiddaman (void) snprintf(kstat_byteswap->name,
755*0886dcadSAndy Fiddaman sizeof (kstat_byteswap->name), "%s_byteswap", ops->name);
756*0886dcadSAndy Fiddaman kstat_byteswap->data_type = KSTAT_DATA_UINT64;
757*0886dcadSAndy Fiddaman kstat_byteswap->value.ui64 = stat->byteswap;
758*0886dcadSAndy Fiddaman }
759*0886dcadSAndy Fiddaman
760*0886dcadSAndy Fiddaman fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
761*0886dcadSAndy Fiddaman KSTAT_TYPE_NAMED, ARRAY_SIZE(fletcher_4_supp_impls) * 2,
762*0886dcadSAndy Fiddaman KSTAT_FLAG_VIRTUAL);
763*0886dcadSAndy Fiddaman
764*0886dcadSAndy Fiddaman if (fletcher_4_kstat != NULL) {
765*0886dcadSAndy Fiddaman fletcher_4_kstat->ks_data = fletcher_4_kstat_data;
766*0886dcadSAndy Fiddaman kstat_install(fletcher_4_kstat);
767*0886dcadSAndy Fiddaman }
768*0886dcadSAndy Fiddaman #endif
769*0886dcadSAndy Fiddaman
770*0886dcadSAndy Fiddaman /* Finish initialization */
771*0886dcadSAndy Fiddaman fletcher_4_initialized = B_TRUE;
772*0886dcadSAndy Fiddaman }
773*0886dcadSAndy Fiddaman
/*
 * Module teardown: in the kernel, delete the benchmark kstat if one was
 * created and clear the handle.  Does nothing in user space.
 */
void
fletcher_4_fini(void)
{
#if defined(_KERNEL)
	if (fletcher_4_kstat == NULL)
		return;

	kstat_delete(fletcher_4_kstat);
	fletcher_4_kstat = NULL;
#endif
}
784*0886dcadSAndy Fiddaman
785*0886dcadSAndy Fiddaman /* ABD adapters */
786*0886dcadSAndy Fiddaman
787*0886dcadSAndy Fiddaman static void
abd_fletcher_4_init(zio_abd_checksum_data_t * cdp)788*0886dcadSAndy Fiddaman abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
789*0886dcadSAndy Fiddaman {
790*0886dcadSAndy Fiddaman const fletcher_4_ops_t *ops = fletcher_4_impl_get();
791*0886dcadSAndy Fiddaman cdp->acd_private = (void *) ops;
792*0886dcadSAndy Fiddaman
793*0886dcadSAndy Fiddaman if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
794*0886dcadSAndy Fiddaman if (ops->uses_fpu_native)
795*0886dcadSAndy Fiddaman kfpu_begin();
796*0886dcadSAndy Fiddaman ops->init_native(cdp->acd_ctx);
797*0886dcadSAndy Fiddaman } else {
798*0886dcadSAndy Fiddaman if (ops->uses_fpu_byteswap)
799*0886dcadSAndy Fiddaman kfpu_begin();
800*0886dcadSAndy Fiddaman ops->init_byteswap(cdp->acd_ctx);
801*0886dcadSAndy Fiddaman }
802*0886dcadSAndy Fiddaman }
803*0886dcadSAndy Fiddaman
804*0886dcadSAndy Fiddaman static void
abd_fletcher_4_fini(zio_abd_checksum_data_t * cdp)805*0886dcadSAndy Fiddaman abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
806*0886dcadSAndy Fiddaman {
807*0886dcadSAndy Fiddaman fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
808*0886dcadSAndy Fiddaman
809*0886dcadSAndy Fiddaman ASSERT(ops);
810*0886dcadSAndy Fiddaman
811*0886dcadSAndy Fiddaman if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) {
812*0886dcadSAndy Fiddaman ops->fini_native(cdp->acd_ctx, cdp->acd_zcp);
813*0886dcadSAndy Fiddaman if (ops->uses_fpu_native)
814*0886dcadSAndy Fiddaman kfpu_end();
815*0886dcadSAndy Fiddaman } else {
816*0886dcadSAndy Fiddaman ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp);
817*0886dcadSAndy Fiddaman if (ops->uses_fpu_byteswap)
818*0886dcadSAndy Fiddaman kfpu_end();
819*0886dcadSAndy Fiddaman }
820*0886dcadSAndy Fiddaman }
821*0886dcadSAndy Fiddaman
822*0886dcadSAndy Fiddaman static void
abd_fletcher_4_simd2scalar(boolean_t native,void * data,size_t size,zio_abd_checksum_data_t * cdp)823*0886dcadSAndy Fiddaman abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size,
824*0886dcadSAndy Fiddaman zio_abd_checksum_data_t *cdp)
825*0886dcadSAndy Fiddaman {
826*0886dcadSAndy Fiddaman zio_cksum_t *zcp = cdp->acd_zcp;
827*0886dcadSAndy Fiddaman
828*0886dcadSAndy Fiddaman ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
829*0886dcadSAndy Fiddaman
830*0886dcadSAndy Fiddaman abd_fletcher_4_fini(cdp);
831*0886dcadSAndy Fiddaman cdp->acd_private = (void *)&fletcher_4_scalar_ops;
832*0886dcadSAndy Fiddaman
833*0886dcadSAndy Fiddaman if (native)
834*0886dcadSAndy Fiddaman fletcher_4_incremental_native(data, size, zcp);
835*0886dcadSAndy Fiddaman else
836*0886dcadSAndy Fiddaman fletcher_4_incremental_byteswap(data, size, zcp);
837*0886dcadSAndy Fiddaman }
838*0886dcadSAndy Fiddaman
839*0886dcadSAndy Fiddaman static int
abd_fletcher_4_iter(void * data,size_t size,void * private)840*0886dcadSAndy Fiddaman abd_fletcher_4_iter(void *data, size_t size, void *private)
841*0886dcadSAndy Fiddaman {
842*0886dcadSAndy Fiddaman zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private;
843*0886dcadSAndy Fiddaman fletcher_4_ctx_t *ctx = cdp->acd_ctx;
844*0886dcadSAndy Fiddaman fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
845*0886dcadSAndy Fiddaman boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE;
846*0886dcadSAndy Fiddaman uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE);
847*0886dcadSAndy Fiddaman
848*0886dcadSAndy Fiddaman ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
849*0886dcadSAndy Fiddaman
850*0886dcadSAndy Fiddaman if (asize > 0) {
851*0886dcadSAndy Fiddaman if (native)
852*0886dcadSAndy Fiddaman ops->compute_native(ctx, data, asize);
853*0886dcadSAndy Fiddaman else
854*0886dcadSAndy Fiddaman ops->compute_byteswap(ctx, data, asize);
855*0886dcadSAndy Fiddaman
856*0886dcadSAndy Fiddaman size -= asize;
857*0886dcadSAndy Fiddaman data = (char *)data + asize;
858*0886dcadSAndy Fiddaman }
859*0886dcadSAndy Fiddaman
860*0886dcadSAndy Fiddaman if (size > 0) {
861*0886dcadSAndy Fiddaman ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE);
862*0886dcadSAndy Fiddaman /* At this point we have to switch to scalar impl */
863*0886dcadSAndy Fiddaman abd_fletcher_4_simd2scalar(native, data, size, cdp);
864*0886dcadSAndy Fiddaman }
865*0886dcadSAndy Fiddaman
866*0886dcadSAndy Fiddaman return (0);
867*0886dcadSAndy Fiddaman }
868*0886dcadSAndy Fiddaman
869*0886dcadSAndy Fiddaman zio_abd_checksum_func_t fletcher_4_abd_ops = {
870*0886dcadSAndy Fiddaman .acf_init = abd_fletcher_4_init,
871*0886dcadSAndy Fiddaman .acf_fini = abd_fletcher_4_fini,
872*0886dcadSAndy Fiddaman .acf_iter = abd_fletcher_4_iter
873*0886dcadSAndy Fiddaman };
874