1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9eda14cbcSMatt Macy * or http://www.opensolaris.org/os/licensing. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
 */

#if defined(__x86_64) && defined(HAVE_AVX512F)

#include <sys/byteorder.h>
#include <sys/frame.h>
#include <sys/spa_checksum.h>
#include <sys/string.h>
#include <sys/simd.h>
#include <zfs_fletcher.h>

#ifdef __linux__
/*
 * On Linux, shorten "__asm" to the volatile form so the compiler never
 * reorders or elides the hand-written AVX-512 statements below — each
 * statement depends on zmm register state left behind by the previous one.
 */
#define	__asm __asm__ __volatile__
#endif

/*
 * Zero the four 512-bit accumulators (one zfs_fletcher_avx512_t per
 * Fletcher-4 running sum A, B, C, D; each holds 8 independent 64-bit lanes).
 *
 * NOTE(review): ZFS_NO_SANITIZE_UNDEFINED presumably silences UBSan for
 * this function — confirm the exact report it suppresses upstream.
 */
ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{
	memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t));
}

/*
 * Fold the 8 per-lane partial sums back into the scalar Fletcher-4
 * result and store it in *zcp.
 *
 * Because each SIMD lane only saw every 8th input word, the higher-order
 * sums must be corrected: the CcA/CcB and DcA/DcB/DcC tables are the
 * combinatorial coefficients (indexed by lane) obtained from expanding
 * the Fletcher-4 recurrences over an 8-way stride.  All arithmetic is
 * uint64_t and intentionally wraps mod 2^64, matching the scalar
 * implementation (ZFS_NO_SANITIZE_UNDEFINED keeps sanitizers quiet about
 * it — NOTE(review): confirm that is the reason for the annotation).
 */
ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	static const uint64_t
	CcA[] = {   0,   0,   1,   3,   6,  10,  15,  21 },
	CcB[] = {  28,  36,  44,  52,  60,  68,  76,  84 },
	DcA[] = {   0,   0,   0,   1,   4,  10,  20,  35 },
	DcB[] = {  56,  84, 120, 164, 216, 276, 344, 420 },
	DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 };

	uint64_t A, B, C, D;
	uint64_t i;

	/* Lane 0 seeds the result ... */
	A = ctx->avx512[0].v[0];
	B = 8 * ctx->avx512[1].v[0];
	C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0];
	D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] +
	    DcB[0] * ctx->avx512[1].v[0];

	/* ... and lanes 1-7 are folded in with their per-lane coefficients. */
	for (i = 1; i < 8; i++) {
		A += ctx->avx512[0].v[i];
		B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i];
		C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] +
		    CcA[i] * ctx->avx512[0].v[i];
		D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] +
		    DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i];
	}

	ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}

/*
 * Load the four saved accumulators from ctx into zmm0-zmm3.  Must be
 * called inside a kfpu_begin()/kfpu_end() region, before the compute loop.
 */
#define	FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
{									\
	__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0]));	\
	__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1]));	\
	__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2]));	\
	__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3]));	\
}

/*
 * Spill zmm0-zmm3 back into ctx so the accumulators survive across calls
 * (the FPU state is not preserved once kfpu_end() runs).
 */
#define	FLETCHER_4_AVX512_SAVE_CTX(ctx)					\
{									\
	__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0]));	\
	__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1]));	\
	__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2]));	\
	__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3]));	\
}

/*
 * Native-endian Fletcher-4: consume 8 x 32-bit words per iteration.
 * vpmovzxdq zero-extends the 8 dwords into 8 qword lanes (zmm4), then the
 * four cascaded vpaddq instructions update the running sums
 * A += data; B += A; C += B; D += C, one zmm register per sum.
 */
static void
fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	kfpu_begin();

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	for (; ip < ipend; ip += 8) {
		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	}

	FLETCHER_4_AVX512_SAVE_CTX(ctx);

	kfpu_end();
}
/* NOTE(review): objtool annotation — this function's stack frame is
 * non-standard; confirm it is required for the inline-asm usage here. */
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);

/*
 * Byte-swapped variant for AVX512F-only CPUs (no vpshufb on zmm).
 * Each zero-extended 32-bit word is bswap32'ed by hand:
 *	zmm8  = 0x000000FF mask	(receives input byte 3 -> output byte 0)
 *	zmm9  = 0x0000FF00 mask	(byte 2 -> byte 1, via >> 8)
 *	zmm10 = 0x00FF0000 mask	(byte 1 -> byte 2, via << 8)
 *	zmm11 = 0xFF000000 mask	(byte 0 -> byte 3, via << 24)
 * and the four pieces are OR'ed together before the same A/B/C/D update
 * cascade as the native path.
 */
static void
fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	static const uint64_t byteswap_mask = 0xFFULL;
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	kfpu_begin();

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	/* Broadcast 0xFF to every qword lane, then shift to build the
	 * per-byte masks described above. */
	__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
	__asm("vpsllq $8, %zmm8, %zmm9");
	__asm("vpsllq $16, %zmm8, %zmm10");
	__asm("vpsllq $24, %zmm8, %zmm11");

	for (; ip < ipend; ip += 8) {
		__asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));

		/* bswap32 each lane: extract and reposition all 4 bytes. */
		__asm("vpsrlq $24, %zmm5, %zmm6");
		__asm("vpandd %zmm8, %zmm6, %zmm6");
		__asm("vpsrlq $8, %zmm5, %zmm7");
		__asm("vpandd %zmm9, %zmm7, %zmm7");
		__asm("vpord %zmm6, %zmm7, %zmm4");
		__asm("vpsllq $8, %zmm5, %zmm6");
		__asm("vpandd %zmm10, %zmm6, %zmm6");
		__asm("vpord %zmm6, %zmm4, %zmm4");
		__asm("vpsllq $24, %zmm5, %zmm5");
		__asm("vpandd %zmm11, %zmm5, %zmm5");
		__asm("vpord %zmm5, %zmm4, %zmm4");

		/* Standard Fletcher-4 accumulator cascade. */
		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	}

	FLETCHER_4_AVX512_SAVE_CTX(ctx)

	kfpu_end();
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);

/* Usable only when the kernel permits FPU use and the CPU has AVX512F. */
static boolean_t
fletcher_4_avx512f_valid(void)
{
	return (kfpu_allowed() && zfs_avx512f_available());
}

/* Method table registered with the fletcher_4 benchmark/dispatch code. */
const fletcher_4_ops_t fletcher_4_avx512f_ops = {
	.init_native = fletcher_4_avx512f_init,
	.fini_native = fletcher_4_avx512f_fini,
	.compute_native = fletcher_4_avx512f_native,
	.init_byteswap = fletcher_4_avx512f_init,
	.fini_byteswap = fletcher_4_avx512f_fini,
	.compute_byteswap = fletcher_4_avx512f_byteswap,
	.valid = fletcher_4_avx512f_valid,
	.name = "avx512f"
};

#if defined(HAVE_AVX512BW)
/*
 * Faster byte-swapped variant for CPUs with AVX512BW: a single vpshufb
 * replaces the shift/and/or sequence above.  The shuffle-control qword
 * 0xFFFFFFFF00010203 selects bytes 3,2,1,0 of each dword (i.e. bswap32)
 * and the 0xFF control bytes (high bit set) zero the upper half of each
 * qword lane, preserving the zero-extension done by vpmovzxdq.
 */
static void
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	static const zfs_fletcher_avx512_t mask = {
		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
	};
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	kfpu_begin();

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));

	for (; ip < ipend; ip += 8) {
		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));

		/* bswap32 all 8 lanes in one instruction. */
		__asm("vpshufb %zmm5, %zmm4, %zmm4");

		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	}

	FLETCHER_4_AVX512_SAVE_CTX(ctx)

	kfpu_end();
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);

/* AVX512BW implies the AVX512F prerequisites plus the BW extension. */
static boolean_t
fletcher_4_avx512bw_valid(void)
{
	return (fletcher_4_avx512f_valid() && zfs_avx512bw_available());
}

/*
 * Same method table as avx512f except the byteswap compute routine;
 * native compute and init/fini are shared with the F-only variant.
 */
const fletcher_4_ops_t fletcher_4_avx512bw_ops = {
	.init_native = fletcher_4_avx512f_init,
	.fini_native = fletcher_4_avx512f_fini,
	.compute_native = fletcher_4_avx512f_native,
	.init_byteswap = fletcher_4_avx512f_init,
	.fini_byteswap = fletcher_4_avx512f_fini,
	.compute_byteswap = fletcher_4_avx512bw_byteswap,
	.valid = fletcher_4_avx512bw_valid,
	.name = "avx512bw"
};
#endif /* defined(HAVE_AVX512BW) */

#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */