1*eda14cbcSMatt Macy /* 2*eda14cbcSMatt Macy * Implement fast Fletcher4 using superscalar pipelines. 3*eda14cbcSMatt Macy * 4*eda14cbcSMatt Macy * Use regular C code to compute 5*eda14cbcSMatt Macy * Fletcher4 in two incremental 64-bit parallel accumulator streams, 6*eda14cbcSMatt Macy * and then combine the streams to form the final four checksum words. 7*eda14cbcSMatt Macy * This implementation is a derivative of the AVX SIMD implementation by 8*eda14cbcSMatt Macy * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). 9*eda14cbcSMatt Macy * 10*eda14cbcSMatt Macy * Copyright (C) 2016 Romain Dolbeau. 11*eda14cbcSMatt Macy * 12*eda14cbcSMatt Macy * Authors: 13*eda14cbcSMatt Macy * Romain Dolbeau <romain.dolbeau@atos.net> 14*eda14cbcSMatt Macy * 15*eda14cbcSMatt Macy * This software is available to you under a choice of one of two 16*eda14cbcSMatt Macy * licenses. You may choose to be licensed under the terms of the GNU 17*eda14cbcSMatt Macy * General Public License (GPL) Version 2, available from the file 18*eda14cbcSMatt Macy * COPYING in the main directory of this source tree, or the 19*eda14cbcSMatt Macy * OpenIB.org BSD license below: 20*eda14cbcSMatt Macy * 21*eda14cbcSMatt Macy * Redistribution and use in source and binary forms, with or 22*eda14cbcSMatt Macy * without modification, are permitted provided that the following 23*eda14cbcSMatt Macy * conditions are met: 24*eda14cbcSMatt Macy * 25*eda14cbcSMatt Macy * - Redistributions of source code must retain the above 26*eda14cbcSMatt Macy * copyright notice, this list of conditions and the following 27*eda14cbcSMatt Macy * disclaimer. 28*eda14cbcSMatt Macy * 29*eda14cbcSMatt Macy * - Redistributions in binary form must reproduce the above 30*eda14cbcSMatt Macy * copyright notice, this list of conditions and the following 31*eda14cbcSMatt Macy * disclaimer in the documentation and/or other materials 32*eda14cbcSMatt Macy * provided with the distribution. 33*eda14cbcSMatt Macy * 34*eda14cbcSMatt Macy * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 35*eda14cbcSMatt Macy * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 36*eda14cbcSMatt Macy * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 37*eda14cbcSMatt Macy * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 38*eda14cbcSMatt Macy * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 39*eda14cbcSMatt Macy * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 40*eda14cbcSMatt Macy * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 41*eda14cbcSMatt Macy * SOFTWARE. 42*eda14cbcSMatt Macy */ 43*eda14cbcSMatt Macy 44*eda14cbcSMatt Macy #include <sys/param.h> 45*eda14cbcSMatt Macy #include <sys/byteorder.h> 46*eda14cbcSMatt Macy #include <sys/spa_checksum.h> 47*eda14cbcSMatt Macy #include <sys/strings.h> 48*eda14cbcSMatt Macy #include <zfs_fletcher.h> 49*eda14cbcSMatt Macy 50*eda14cbcSMatt Macy static void 51*eda14cbcSMatt Macy fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx) 52*eda14cbcSMatt Macy { 53*eda14cbcSMatt Macy bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t)); 54*eda14cbcSMatt Macy } 55*eda14cbcSMatt Macy 56*eda14cbcSMatt Macy static void 57*eda14cbcSMatt Macy fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) 58*eda14cbcSMatt Macy { 59*eda14cbcSMatt Macy uint64_t A, B, C, D; 60*eda14cbcSMatt Macy A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1]; 61*eda14cbcSMatt Macy B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] - 62*eda14cbcSMatt Macy ctx->superscalar[0].v[1]; 63*eda14cbcSMatt Macy C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] + 64*eda14cbcSMatt Macy 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1]; 65*eda14cbcSMatt Macy D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] + 66*eda14cbcSMatt Macy 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] + 67*eda14cbcSMatt Macy ctx->superscalar[1].v[1]; 68*eda14cbcSMatt Macy ZIO_SET_CHECKSUM(zcp, A, B, C, D); 69*eda14cbcSMatt Macy } 70*eda14cbcSMatt Macy 71*eda14cbcSMatt Macy static void 72*eda14cbcSMatt Macy fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, 73*eda14cbcSMatt Macy const void *buf, uint64_t size) 74*eda14cbcSMatt Macy { 75*eda14cbcSMatt Macy const uint32_t *ip = buf; 76*eda14cbcSMatt Macy const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 77*eda14cbcSMatt Macy uint64_t a, b, c, d; 78*eda14cbcSMatt Macy uint64_t a2, b2, c2, d2; 79*eda14cbcSMatt Macy 80*eda14cbcSMatt Macy a = ctx->superscalar[0].v[0]; 81*eda14cbcSMatt Macy b = ctx->superscalar[1].v[0]; 82*eda14cbcSMatt Macy c = ctx->superscalar[2].v[0]; 83*eda14cbcSMatt Macy d = ctx->superscalar[3].v[0]; 84*eda14cbcSMatt Macy a2 = ctx->superscalar[0].v[1]; 85*eda14cbcSMatt Macy b2 = ctx->superscalar[1].v[1]; 86*eda14cbcSMatt Macy c2 = ctx->superscalar[2].v[1]; 87*eda14cbcSMatt Macy d2 = ctx->superscalar[3].v[1]; 88*eda14cbcSMatt Macy 89*eda14cbcSMatt Macy for (; ip < ipend; ip += 2) { 90*eda14cbcSMatt Macy a += ip[0]; 91*eda14cbcSMatt Macy a2 += ip[1]; 92*eda14cbcSMatt Macy b += a; 93*eda14cbcSMatt Macy b2 += a2; 94*eda14cbcSMatt Macy c += b; 95*eda14cbcSMatt Macy c2 += b2; 96*eda14cbcSMatt Macy d += c; 97*eda14cbcSMatt Macy d2 += c2; 98*eda14cbcSMatt Macy } 99*eda14cbcSMatt Macy 100*eda14cbcSMatt Macy ctx->superscalar[0].v[0] = a; 101*eda14cbcSMatt Macy ctx->superscalar[1].v[0] = b; 102*eda14cbcSMatt Macy ctx->superscalar[2].v[0] = c; 103*eda14cbcSMatt Macy ctx->superscalar[3].v[0] = d; 104*eda14cbcSMatt Macy ctx->superscalar[0].v[1] = a2; 105*eda14cbcSMatt Macy ctx->superscalar[1].v[1] = b2; 106*eda14cbcSMatt Macy ctx->superscalar[2].v[1] = c2; 107*eda14cbcSMatt Macy ctx->superscalar[3].v[1] = d2; 108*eda14cbcSMatt Macy } 109*eda14cbcSMatt Macy 110*eda14cbcSMatt Macy static void 111*eda14cbcSMatt Macy fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx, 112*eda14cbcSMatt Macy const void *buf, uint64_t size) 113*eda14cbcSMatt Macy { 114*eda14cbcSMatt Macy const uint32_t *ip = buf; 115*eda14cbcSMatt Macy const uint32_t *ipend = ip + (size / sizeof (uint32_t)); 116*eda14cbcSMatt Macy uint64_t a, b, c, d; 117*eda14cbcSMatt Macy uint64_t a2, b2, c2, d2; 118*eda14cbcSMatt Macy 119*eda14cbcSMatt Macy a = ctx->superscalar[0].v[0]; 120*eda14cbcSMatt Macy b = ctx->superscalar[1].v[0]; 121*eda14cbcSMatt Macy c = ctx->superscalar[2].v[0]; 122*eda14cbcSMatt Macy d = ctx->superscalar[3].v[0]; 123*eda14cbcSMatt Macy a2 = ctx->superscalar[0].v[1]; 124*eda14cbcSMatt Macy b2 = ctx->superscalar[1].v[1]; 125*eda14cbcSMatt Macy c2 = ctx->superscalar[2].v[1]; 126*eda14cbcSMatt Macy d2 = ctx->superscalar[3].v[1]; 127*eda14cbcSMatt Macy 128*eda14cbcSMatt Macy for (; ip < ipend; ip += 2) { 129*eda14cbcSMatt Macy a += BSWAP_32(ip[0]); 130*eda14cbcSMatt Macy a2 += BSWAP_32(ip[1]); 131*eda14cbcSMatt Macy b += a; 132*eda14cbcSMatt Macy b2 += a2; 133*eda14cbcSMatt Macy c += b; 134*eda14cbcSMatt Macy c2 += b2; 135*eda14cbcSMatt Macy d += c; 136*eda14cbcSMatt Macy d2 += c2; 137*eda14cbcSMatt Macy } 138*eda14cbcSMatt Macy 139*eda14cbcSMatt Macy ctx->superscalar[0].v[0] = a; 140*eda14cbcSMatt Macy ctx->superscalar[1].v[0] = b; 141*eda14cbcSMatt Macy ctx->superscalar[2].v[0] = c; 142*eda14cbcSMatt Macy ctx->superscalar[3].v[0] = d; 143*eda14cbcSMatt Macy ctx->superscalar[0].v[1] = a2; 144*eda14cbcSMatt Macy ctx->superscalar[1].v[1] = b2; 145*eda14cbcSMatt Macy ctx->superscalar[2].v[1] = c2; 146*eda14cbcSMatt Macy ctx->superscalar[3].v[1] = d2; 147*eda14cbcSMatt Macy } 148*eda14cbcSMatt Macy 149*eda14cbcSMatt Macy static boolean_t fletcher_4_superscalar_valid(void) 150*eda14cbcSMatt Macy { 151*eda14cbcSMatt Macy return (B_TRUE); 152*eda14cbcSMatt Macy } 153*eda14cbcSMatt Macy 154*eda14cbcSMatt Macy const fletcher_4_ops_t fletcher_4_superscalar_ops = { 155*eda14cbcSMatt Macy .init_native = fletcher_4_superscalar_init, 156*eda14cbcSMatt Macy .compute_native = fletcher_4_superscalar_native, 157*eda14cbcSMatt Macy .fini_native = fletcher_4_superscalar_fini, 158*eda14cbcSMatt Macy .init_byteswap = fletcher_4_superscalar_init, 159*eda14cbcSMatt Macy .compute_byteswap = fletcher_4_superscalar_byteswap, 160*eda14cbcSMatt Macy .fini_byteswap = fletcher_4_superscalar_fini, 161*eda14cbcSMatt Macy .valid = fletcher_4_superscalar_valid, 162*eda14cbcSMatt Macy .name = "superscalar" 163*eda14cbcSMatt Macy }; 164