199a2dd95SBruce Richardson /* SPDX-License-Identifier: BSD-3-Clause 299a2dd95SBruce Richardson * Copyright(c) 2020 Intel Corporation 399a2dd95SBruce Richardson */ 499a2dd95SBruce Richardson 599a2dd95SBruce Richardson #include <rte_vect.h> 699a2dd95SBruce Richardson #include <rte_fib.h> 799a2dd95SBruce Richardson 899a2dd95SBruce Richardson #include "dir24_8.h" 999a2dd95SBruce Richardson #include "dir24_8_avx512.h" 1099a2dd95SBruce Richardson 1199a2dd95SBruce Richardson static __rte_always_inline void 1299a2dd95SBruce Richardson dir24_8_vec_lookup_x16(void *p, const uint32_t *ips, 13*e194f3cdSVladimir Medvedkin uint64_t *next_hops, int size, bool be_addr) 1499a2dd95SBruce Richardson { 1599a2dd95SBruce Richardson struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p; 1699a2dd95SBruce Richardson __mmask16 msk_ext; 1799a2dd95SBruce Richardson __mmask16 exp_msk = 0x5555; 1899a2dd95SBruce Richardson __m512i ip_vec, idxes, res, bytes; 1999a2dd95SBruce Richardson const __m512i zero = _mm512_set1_epi32(0); 2099a2dd95SBruce Richardson const __m512i lsb = _mm512_set1_epi32(1); 2199a2dd95SBruce Richardson const __m512i lsbyte_msk = _mm512_set1_epi32(0xff); 2299a2dd95SBruce Richardson __m512i tmp1, tmp2, res_msk; 2399a2dd95SBruce Richardson __m256i tmp256; 2499a2dd95SBruce Richardson /* used to mask gather values if size is 1/2 (8/16 bit next hops) */ 2599a2dd95SBruce Richardson if (size == sizeof(uint8_t)) 2699a2dd95SBruce Richardson res_msk = _mm512_set1_epi32(UINT8_MAX); 2799a2dd95SBruce Richardson else if (size == sizeof(uint16_t)) 2899a2dd95SBruce Richardson res_msk = _mm512_set1_epi32(UINT16_MAX); 2999a2dd95SBruce Richardson 3099a2dd95SBruce Richardson ip_vec = _mm512_loadu_si512(ips); 31*e194f3cdSVladimir Medvedkin if (be_addr) { 32*e194f3cdSVladimir Medvedkin const __m512i bswap32 = _mm512_set_epi32( 33*e194f3cdSVladimir Medvedkin 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203, 34*e194f3cdSVladimir Medvedkin 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203, 35*e194f3cdSVladimir Medvedkin 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203, 36*e194f3cdSVladimir Medvedkin 0x0c0d0e0f, 0x08090a0b, 0x04050607, 0x00010203 37*e194f3cdSVladimir Medvedkin ); 38*e194f3cdSVladimir Medvedkin ip_vec = _mm512_shuffle_epi8(ip_vec, bswap32); 39*e194f3cdSVladimir Medvedkin } 40*e194f3cdSVladimir Medvedkin 4199a2dd95SBruce Richardson /* mask 24 most significant bits */ 4299a2dd95SBruce Richardson idxes = _mm512_srli_epi32(ip_vec, 8); 4399a2dd95SBruce Richardson 4499a2dd95SBruce Richardson /** 4599a2dd95SBruce Richardson * lookup in tbl24 4699a2dd95SBruce Richardson * Put it inside branch to make compiler happy with -O0 4799a2dd95SBruce Richardson */ 4899a2dd95SBruce Richardson if (size == sizeof(uint8_t)) { 4999a2dd95SBruce Richardson res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 1); 5099a2dd95SBruce Richardson res = _mm512_and_epi32(res, res_msk); 5199a2dd95SBruce Richardson } else if (size == sizeof(uint16_t)) { 5299a2dd95SBruce Richardson res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 2); 5399a2dd95SBruce Richardson res = _mm512_and_epi32(res, res_msk); 5499a2dd95SBruce Richardson } else 5599a2dd95SBruce Richardson res = _mm512_i32gather_epi32(idxes, (const int *)dp->tbl24, 4); 5699a2dd95SBruce Richardson 5799a2dd95SBruce Richardson /* get extended entries indexes */ 5899a2dd95SBruce Richardson msk_ext = _mm512_test_epi32_mask(res, lsb); 5999a2dd95SBruce Richardson 6099a2dd95SBruce Richardson if (msk_ext != 0) { 6199a2dd95SBruce Richardson idxes = _mm512_srli_epi32(res, 1); 6299a2dd95SBruce Richardson idxes = _mm512_slli_epi32(idxes, 8); 6399a2dd95SBruce Richardson bytes = _mm512_and_epi32(ip_vec, lsbyte_msk); 6499a2dd95SBruce Richardson idxes = _mm512_maskz_add_epi32(msk_ext, idxes, bytes); 6599a2dd95SBruce Richardson if (size == sizeof(uint8_t)) { 6699a2dd95SBruce Richardson idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, 6799a2dd95SBruce Richardson idxes, (const int *)dp->tbl8, 1); 6899a2dd95SBruce Richardson idxes = _mm512_and_epi32(idxes, res_msk); 6999a2dd95SBruce Richardson } else if (size == sizeof(uint16_t)) { 7099a2dd95SBruce Richardson idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, 7199a2dd95SBruce Richardson idxes, (const int *)dp->tbl8, 2); 7299a2dd95SBruce Richardson idxes = _mm512_and_epi32(idxes, res_msk); 7399a2dd95SBruce Richardson } else 7499a2dd95SBruce Richardson idxes = _mm512_mask_i32gather_epi32(zero, msk_ext, 7599a2dd95SBruce Richardson idxes, (const int *)dp->tbl8, 4); 7699a2dd95SBruce Richardson 7799a2dd95SBruce Richardson res = _mm512_mask_blend_epi32(msk_ext, res, idxes); 7899a2dd95SBruce Richardson } 7999a2dd95SBruce Richardson 8099a2dd95SBruce Richardson res = _mm512_srli_epi32(res, 1); 8199a2dd95SBruce Richardson tmp1 = _mm512_maskz_expand_epi32(exp_msk, res); 8299a2dd95SBruce Richardson tmp256 = _mm512_extracti32x8_epi32(res, 1); 8399a2dd95SBruce Richardson tmp2 = _mm512_maskz_expand_epi32(exp_msk, 8499a2dd95SBruce Richardson _mm512_castsi256_si512(tmp256)); 8599a2dd95SBruce Richardson _mm512_storeu_si512(next_hops, tmp1); 8699a2dd95SBruce Richardson _mm512_storeu_si512(next_hops + 8, tmp2); 8799a2dd95SBruce Richardson } 8899a2dd95SBruce Richardson 8999a2dd95SBruce Richardson static __rte_always_inline void 9099a2dd95SBruce Richardson dir24_8_vec_lookup_x8_8b(void *p, const uint32_t *ips, 91*e194f3cdSVladimir Medvedkin uint64_t *next_hops, bool be_addr) 9299a2dd95SBruce Richardson { 9399a2dd95SBruce Richardson struct dir24_8_tbl *dp = (struct dir24_8_tbl *)p; 9499a2dd95SBruce Richardson const __m512i zero = _mm512_set1_epi32(0); 9599a2dd95SBruce Richardson const __m512i lsbyte_msk = _mm512_set1_epi64(0xff); 9699a2dd95SBruce Richardson const __m512i lsb = _mm512_set1_epi64(1); 9799a2dd95SBruce Richardson __m512i res, idxes, bytes; 9899a2dd95SBruce Richardson __m256i idxes_256, ip_vec; 9999a2dd95SBruce Richardson __mmask8 msk_ext; 10099a2dd95SBruce Richardson 10199a2dd95SBruce Richardson ip_vec = _mm256_loadu_si256((const void *)ips); 102*e194f3cdSVladimir Medvedkin if (be_addr) { 103*e194f3cdSVladimir Medvedkin const __m256i bswap32 = _mm256_set_epi8( 104*e194f3cdSVladimir Medvedkin 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 105*e194f3cdSVladimir Medvedkin 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 106*e194f3cdSVladimir Medvedkin ); 107*e194f3cdSVladimir Medvedkin ip_vec = _mm256_shuffle_epi8(ip_vec, bswap32); 108*e194f3cdSVladimir Medvedkin } 10999a2dd95SBruce Richardson /* mask 24 most significant bits */ 11099a2dd95SBruce Richardson idxes_256 = _mm256_srli_epi32(ip_vec, 8); 11199a2dd95SBruce Richardson 11299a2dd95SBruce Richardson /* lookup in tbl24 */ 11399a2dd95SBruce Richardson res = _mm512_i32gather_epi64(idxes_256, (const void *)dp->tbl24, 8); 11499a2dd95SBruce Richardson 11599a2dd95SBruce Richardson /* get extended entries indexes */ 11699a2dd95SBruce Richardson msk_ext = _mm512_test_epi64_mask(res, lsb); 11799a2dd95SBruce Richardson 11899a2dd95SBruce Richardson if (msk_ext != 0) { 11999a2dd95SBruce Richardson bytes = _mm512_cvtepi32_epi64(ip_vec); 12099a2dd95SBruce Richardson idxes = _mm512_srli_epi64(res, 1); 12199a2dd95SBruce Richardson idxes = _mm512_slli_epi64(idxes, 8); 12299a2dd95SBruce Richardson bytes = _mm512_and_epi64(bytes, lsbyte_msk); 12399a2dd95SBruce Richardson idxes = _mm512_maskz_add_epi64(msk_ext, idxes, bytes); 12499a2dd95SBruce Richardson idxes = _mm512_mask_i64gather_epi64(zero, msk_ext, idxes, 12599a2dd95SBruce Richardson (const void *)dp->tbl8, 8); 12699a2dd95SBruce Richardson 12799a2dd95SBruce Richardson res = _mm512_mask_blend_epi64(msk_ext, res, idxes); 12899a2dd95SBruce Richardson } 12999a2dd95SBruce Richardson 13099a2dd95SBruce Richardson res = _mm512_srli_epi64(res, 1); 13199a2dd95SBruce Richardson _mm512_storeu_si512(next_hops, res); 13299a2dd95SBruce Richardson } 13399a2dd95SBruce Richardson 134*e194f3cdSVladimir Medvedkin #define DECLARE_VECTOR_FN(suffix, nh_type, be_addr) \ 135*e194f3cdSVladimir Medvedkin void \ 136*e194f3cdSVladimir Medvedkin rte_dir24_8_vec_lookup_bulk_##suffix(void *p, const uint32_t *ips, uint64_t *next_hops, \ 137*e194f3cdSVladimir Medvedkin const unsigned int n) \ 138*e194f3cdSVladimir Medvedkin { \ 139*e194f3cdSVladimir Medvedkin uint32_t i; \ 140*e194f3cdSVladimir Medvedkin for (i = 0; i < (n / 16); i++) \ 141*e194f3cdSVladimir Medvedkin dir24_8_vec_lookup_x16(p, ips + i * 16, next_hops + i * 16, sizeof(nh_type), \ 142*e194f3cdSVladimir Medvedkin be_addr); \ 143*e194f3cdSVladimir Medvedkin dir24_8_lookup_bulk_##suffix(p, ips + i * 16, next_hops + i * 16, n - i * 16); \ 14499a2dd95SBruce Richardson } 14599a2dd95SBruce Richardson 146*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(1b, uint8_t, false) 147*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(1b_be, uint8_t, true) 148*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(2b, uint16_t, false) 149*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(2b_be, uint16_t, true) 150*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(4b, uint32_t, false) 151*e194f3cdSVladimir Medvedkin DECLARE_VECTOR_FN(4b_be, uint32_t, true) 15299a2dd95SBruce Richardson 15399a2dd95SBruce Richardson void 15499a2dd95SBruce Richardson rte_dir24_8_vec_lookup_bulk_8b(void *p, const uint32_t *ips, 15599a2dd95SBruce Richardson uint64_t *next_hops, const unsigned int n) 15699a2dd95SBruce Richardson { 15799a2dd95SBruce Richardson uint32_t i; 15899a2dd95SBruce Richardson for (i = 0; i < (n / 8); i++) 159*e194f3cdSVladimir Medvedkin dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8, false); 16099a2dd95SBruce Richardson dir24_8_lookup_bulk_8b(p, ips + i * 8, next_hops + i * 8, n - i * 8); 16199a2dd95SBruce Richardson } 162*e194f3cdSVladimir Medvedkin 163*e194f3cdSVladimir Medvedkin void 164*e194f3cdSVladimir Medvedkin rte_dir24_8_vec_lookup_bulk_8b_be(void *p, const uint32_t *ips, 165*e194f3cdSVladimir Medvedkin uint64_t *next_hops, const unsigned int n) 166*e194f3cdSVladimir Medvedkin { 167*e194f3cdSVladimir Medvedkin uint32_t i; 168*e194f3cdSVladimir Medvedkin for (i = 0; i < (n / 8); i++) 169*e194f3cdSVladimir Medvedkin dir24_8_vec_lookup_x8_8b(p, ips + i * 8, next_hops + i * 8, true); 170*e194f3cdSVladimir Medvedkin dir24_8_lookup_bulk_8b_be(p, ips + i * 8, next_hops + i * 8, n - i * 8); 171*e194f3cdSVladimir Medvedkin } 172