/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2020 Intel Corporation
 */

#include <rte_vect.h>
#include <rte_fib.h>

#include "dir24_8.h"
#include "dir24_8_avx512.h"

static __rte_always_inline void
dir24_8_vec_lookup_x16(void *p, const uint32_t *ips,
	uint64_t *next_hops, int size, bool be_addr)
{
	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
	__mmask16 msk_ext;
	__mmask16 exp_msk = 0x5555;
	__m512i ip_vec, idxes, res, bytes;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsb = _mm512_set1_epi32(1);
	const __m512i lsbyte_msk = _mm512_set1_epi32(0xff);
	__m512i tmp1, tmp2, res_msk;
	__m256i tmp256;

	/* used to mask gather values if size is 1/2 (8/16 bit next hops) */
	if (size == sizeof(uint8_t))
		res_msk = _mm512_set1_epi32(UINT8_MAX);
	else if (size == sizeof(uint16_t))
		res_msk = _mm512_set1_epi32(UINT16_MAX);

	ip_vec = _mm512_loadu_si512(ips);
	if (be_addr) {
		/* swap the bytes of each 32-bit address to host order */
		const __m512i bswap32 = _mm512_set_epi32(
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203,
			0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203
		);
		ip_vec = _mm512_shuffle_epi8(ip_vec, bswap32);
	}

	/* use the 24 most significant bits of each address as tbl24 index */
	idxes = _mm512_srli_epi32(ip_vec, 8);

	/**
	 * lookup in tbl24
	 * Kept inside the branch to make the compiler happy with -O0
	 * (res_msk is only initialized for 1/2 byte entries)
	 */
	if (size == sizeof(uint8_t)) {
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1);
		res = _mm512_and_epi32(res, res_msk);
	} else if (size == sizeof(uint16_t)) {
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2);
		res = _mm512_and_epi32(res, res_msk);
	} else
		res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4);

	/* get extended entries indexes */
	msk_ext = _mm512_test_epi32_mask(res, lsb);

	if (msk_ext != 0) {
		idxes = _mm512_srli_epi32(res, 1);
		idxes = _mm512_slli_epi32(idxes, 8);
		bytes = _mm512_and_epi32(ip_vec, lsbyte_msk);
		idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes);
		if (size == sizeof(uint8_t)) {
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 1);
			idxes = _mm512_and_epi32(idxes, res_msk);
		} else if (size == sizeof(uint16_t)) {
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 2);
			idxes = _mm512_and_epi32(idxes, res_msk);
		} else
			idxes = _mm512_mask_i32gather_epi32(zero, msk_ext,
				idxes, (const int *)dp->tbl8, 4);

		res = _mm512_mask_blend_epi32(msk_ext, res, idxes);
	}

	/* drop the extended-entry flag bit, then widen the 16 32-bit
	 * results into 16 64-bit next hops
	 */
	res = _mm512_srli_epi32(res, 1);
	tmp1 = _mm512_maskz_expand_epi32(exp_msk, res);
	tmp256 = _mm512_extracti32x8_epi32(res, 1);
	tmp2 = _mm512_maskz_expand_epi32(exp_msk,
		_mm512_castsi256_si512(tmp256));
	_mm512_storeu_si512(next_hops, tmp1);
	_mm512_storeu_si512(next_hops + 8, tmp2);
}
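/*
 * Illustrative scalar equivalent of one lane of the lookup above, for
 * the 4-byte entry case. This is a hypothetical helper for reference
 * only; nothing below calls it. Bit 0 of a tbl24 entry flags an
 * extended entry, in which case the remaining bits select a 256-entry
 * tbl8 group indexed by the least significant address byte; shifting
 * the flag bit out yields the next hop.
 */
static __rte_always_inline uint64_t
dir24_8_scalar_sketch(const struct dir24_8_tbl *dp, uint32_t ip)
{
	uint32_t ent = ((const uint32_t *)dp->tbl24)[ip >> 8];

	if (ent & 1)
		ent = ((const uint32_t *)dp->tbl8)
			[((ent >> 1) << 8) + (ip & 0xff)];

	return ent >> 1;
}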
static __rte_always_inline void
dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, bool be_addr)
{
	struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p;
	const __m512i zero = _mm512_set1_epi32(0);
	const __m512i lsbyte_msk = _mm512_set1_epi64(0xff);
	const __m512i lsb = _mm512_set1_epi64(1);
	__m512i res, idxes, bytes;
	__m256i idxes_256, ip_vec;
	__mmask8 msk_ext;

	ip_vec = _mm256_loadu_si256((const void *)ips);
	if (be_addr) {
		/* swap the bytes of each 32-bit address to host order */
		const __m256i bswap32 = _mm256_set_epi8(
			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3,
			12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
		);
		ip_vec = _mm256_shuffle_epi8(ip_vec, bswap32);
	}

	/* use the 24 most significant bits of each address as tbl24 index */
	idxes_256 = _mm256_srli_epi32(ip_vec, 8);

	/* lookup in tbl24 */
	res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8);

	/* get extended entries indexes */
	msk_ext = _mm512_test_epi64_mask(res, lsb);

	if (msk_ext != 0) {
		bytes = _mm512_cvtepi32_epi64(ip_vec);
		idxes = _mm512_srli_epi64(res, 1);
		idxes = _mm512_slli_epi64(idxes, 8);
		bytes = _mm512_and_epi64(bytes, lsbyte_msk);
		idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes);
		idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes,
			(const void *)dp->tbl8, 8);

		res = _mm512_mask_blend_epi64(msk_ext, res, idxes);
	}

	/* drop the extended-entry flag bit to recover the next hops */
	res = _mm512_srli_epi64(res, 1);
	_mm512_storeu_si512(next_hops, res);
}

#define DECLARE_VECTOR_FN(suffix, nh_type, be_addr) \
void \
rte_dir24_8_vec_lookup_bulk_##suffix(void *p, const uint32_t *ips, \
	uint64_t *next_hops, const unsigned int n) \
{ \
	uint32_t i; \
	for (i = 0; i < (n / 16); i++) \
		dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, \
			sizeof(nh_type), be_addr); \
	/* handle the remaining (n % 16) addresses with the scalar path */ \
	dir24_8_lookup_bulk_##suffix(p, ips + i * 16, next_hops + i * 16, \
		n - i * 16); \
}

DECLARE_VECTOR_FN(1b, uint8_t, false)
DECLARE_VECTOR_FN(1b_be, uint8_t, true)
DECLARE_VECTOR_FN(2b, uint16_t, false)
DECLARE_VECTOR_FN(2b_be, uint16_t, true)
DECLARE_VECTOR_FN(4b, uint32_t, false)
DECLARE_VECTOR_FN(4b_be, uint32_t, true)

void
rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 8); i++)
		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8,
			false);
	dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8);
}

void
rte_dir24_8_vec_lookup_bulk_8b_be(void *p, const uint32_t *ips,
	uint64_t *next_hops, const unsigned int n)
{
	uint32_t i;

	for (i = 0; i < (n / 8); i++)
		dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8,
			true);
	dir24_8_lookup_bulk_8b_be(p, ips + i * 8, next_hops + i * 8,
		n - i * 8);
}
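/*
 * Usage sketch (illustrative, not part of this file): applications reach
 * these lookups through the rte_fib API rather than calling them directly.
 * Field and enum names below follow recent DPDK releases; check the
 * rte_fib.h of your version. The AVX512 path can be requested explicitly
 * with rte_fib_select_lookup(), e.g.:
 *
 *	struct rte_fib_conf conf = {
 *		.type = RTE_FIB_DIR24_8,
 *		.default_nh = 0,
 *		.max_routes = 1 << 20,
 *		.dir24_8 = {
 *			.nh_sz = RTE_FIB_DIR24_8_4B,
 *			.num_tbl8 = 1 << 15,
 *		},
 *	};
 *	struct rte_fib *fib = rte_fib_create("demo", SOCKET_ID_ANY, &conf);
 *
 *	rte_fib_add(fib, RTE_IPV4(10, 0, 0, 0), 24, 42);
 *	rte_fib_select_lookup(fib, RTE_FIB_LOOKUP_DIR24_8_VECTOR_AVX512);
 *
 *	uint32_t ips[16] = { RTE_IPV4(10, 0, 0, 1), ... };
 *	uint64_t next_hops[16];
 *	rte_fib_lookup_bulk(fib, ips, next_hops, 16);
 */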