;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;    * Redistributions of source code must retain the above copyright
;      notice, this list of conditions and the following disclaimer.
;    * Redistributions in binary form must reproduce the above copyright
;      notice, this list of conditions and the following disclaimer in
;      the documentation and/or other materials provided with the
;      distribution.
;    * Neither the name of Intel Corporation nor the names of its
;      contributors may be used to endorse or promote products derived
;      from this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

;;;;
; Syntax: NASM (Intel operand order).
;
; This file only defines preprocessor macros. It relies on names that are
; not defined here and must be supplied by the including file:
;   mk_global, WRT_OPT, FLAG_CPUID1_ECX_*, FLAG_CPUID7_EBX_*,
;   FLAGS_CPUID7_EBX_AVX512_G1, FLAGS_CPUID7_ECX_AVX512_G2,
;   FLAG_XGETBV_EAX_XMM_YMM, FLAG_XGETBV_EAX_ZMM_OPM
;
; Common contract of every *_dispatch_init routine generated below:
;   - every general purpose register it touches is pushed on entry and
;     popped before returning
;   - EFLAGS are modified
;   - the chosen implementation address is stored in <name>_dispatched
;;;;

%ifndef _MULTIBINARY_ASM_
%define _MULTIBINARY_ASM_

; Pointer-sized aliases so the same macro bodies assemble for elf32 and
; for every other (64-bit) output format.
%ifidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rdi	edi
 %define mbin_rsi	esi
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
%else
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rdi	rdi
 %define mbin_rsi	rsi
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
%endif

; AS_FEATURE_LEVEL gates which dispatchers below get a full body
; (>= 6 for mbin_dispatch_init6, >= 10 for mbin_dispatch_init7).
; Defaults to 4 when the build does not define it.
%ifndef AS_FEATURE_LEVEL
%define AS_FEATURE_LEVEL 4
%endif

;;;;
; multibinary macro:
;   creates the visible entry point that uses HW optimized call pointer
;   creates the init of the HW optimized call pointer
;
; Parameters:
;   1-> function name
;
; Emits:
;   <name>_dispatched - pointer-sized slot in .data, initialised to the
;                       address of <name>_mbinit
;   <name>_mbinit     - stub that calls <name>_dispatch_init and then
;                       falls through into <name>
;   <name>            - entry point passed to mk_global; performs an
;                       indirect jmp through <name>_dispatched
;
; <name>_dispatch_init is not defined here; one of the mbin_dispatch_*
; macros below has to be instantiated for the same function name.
;;;;
%macro mbin_interface 1
	;;;;
	; *_dispatched is defaulted to *_mbinit and replaced on first call.
	; Therefore, *_dispatch_init is only executed on first call.
	;;;;
	section .data
	%1_dispatched:
		mbin_def_ptr	%1_mbinit

	section .text
	mk_global %1, function
	%1_mbinit:
		;;; only called the first time to setup hardware match
		call	%1_dispatch_init
		;;; falls thru to execute the hw optimized code
	%1:
		jmp	mbin_ptr_sz [%1_dispatched]
%endmacro

;;;;;
; mbin_dispatch_init parameters
; Use this function when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;
; Selection:
;   - start with parameter 2
;   - CPUID.1:ECX must report both AVX and OSXSAVE, otherwise done
;   - parameter 3, upgraded to parameter 4 when CPUID.7:EBX reports AVX2
;   - XGETBV(0) must report XMM and YMM state enabled, otherwise the
;     choice is reset to parameter 2
;;;;;
%macro mbin_dispatch_init 4
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
		jne	_%1_init_done ; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT] ; OS did not enable YMM state, back to base

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init2 parameters
; Cases where only base functions are available
; 1-> function name
; 2-> base function
;
; No CPU feature detection is performed; parameter 2 is always stored
; in <name>_dispatched.
;;;;;
%macro mbin_dispatch_init2 2
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;
; Selection:
;   - start with parameter 2
;   - parameter 3 when CPUID.1:ECX reports SSE4_1
;   - CPUID.1:ECX must report both AVX and OSXSAVE, otherwise done
;   - parameter 4, upgraded to parameter 5 when CPUID.7:EBX reports AVX2
;   - XGETBV(0) must report XMM and YMM state enabled, otherwise the
;     choice is reset to parameter 3
;;;;;
%macro mbin_dispatch_init5 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		; Test for SSE4.1
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func
		cmovne	mbin_rsi, mbin_rbx

		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen2) opt func
		jne	_%1_init_done ; AVX is not available so end
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%5 WRT_OPT] ; AVX (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; OS did not enable YMM state, back to SSE

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

%if AS_FEATURE_LEVEL >= 6
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;
; Selection (each step requires the previous one to have succeeded):
;   - parameter 2 by default
;   - parameter 3 when CPUID.1:ECX reports SSE4_1
;   - parameter 4 when OSXSAVE is set, XGETBV(0) reports XMM and YMM
;     state enabled, and CPUID.1:ECX reports AVX
;   - parameter 5 when CPUID.7:EBX reports AVX2
;   - parameter 6 when XGETBV(0) reports the ZMM/opmask state bits and
;     CPUID.7:EBX has every bit of FLAGS_CPUID7_EBX_AVX512_G1 set
;
; Register roles inside the routine:
;   ebx = saved CPUID.1:ECX (until CPUID leaf 7 overwrites it)
;   edi = saved XGETBV(0) low dword
;;;;;
%macro mbin_dispatch_init6 6
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		je	_%1_init_done	  ; Use base function if no SSE4_1
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		lea	mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro

%else
; Assembler feature level below 6: drop the AVX512 candidate and
; dispatch among the first five parameters only.
%macro mbin_dispatch_init6 6
	mbin_dispatch_init5 %1, %2, %3, %4, %5
%endmacro
%endif

%if AS_FEATURE_LEVEL >= 10
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;
; Selection (each step requires the previous one to have succeeded):
;   - parameter 2 by default
;   - parameter 3 when CPUID.1:ECX reports SSE4_2
;   - parameter 4 when OSXSAVE is set, XGETBV(0) reports XMM and YMM
;     state enabled, and CPUID.1:ECX reports AVX
;   - parameter 5 when CPUID.7:EBX reports AVX2
;   - parameter 6 when XGETBV(0) reports the ZMM/opmask state bits and
;     CPUID.7:EBX has every bit of FLAGS_CPUID7_EBX_AVX512_G1 set
;   - parameter 7 when, in addition, CPUID.7:ECX has every bit of
;     FLAGS_CPUID7_ECX_AVX512_G2 set
;
; Register roles inside the routine:
;   ebx = saved CPUID.1:ECX (until CPUID leaf 7 overwrites it)
;   edi = saved XGETBV(0) low dword
;   ecx = CPUID.7:ECX, kept untouched from the leaf 7 query until the
;         G2 test
;;;;;
%macro mbin_dispatch_init7 7
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		;; The G2 function is only a candidate once G1 is confirmed;
		;; without this branch a CPUID image that advertises the G2
		;; bits but not all G1 bits would still pick the /10 function.
		jne	_%1_init_done	  ; No AVX512 G1, stay on AVX2
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for AVX512 update (G2)
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%7 WRT_OPT] ; AVX512 Update/10 opt
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
%else
; Assembler feature level below 10: drop the AVX512 update candidate and
; dispatch among the first six parameters only.
; NOTE(review): the forwarded dispatcher gates parameter 3 on SSE4_1,
; whereas the full body above gates it on SSE4_2 - confirm that every
; parameter 3 passed by callers is safe on SSE4_1-only hardware.
%macro mbin_dispatch_init7 7
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
%endif

;;;;;
; mbin_dispatch_sse_to_avx2_shani parameters
; derived from mbin_dispatch_init
; Use this function when SSE/00/01 is a minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
; 5-> SHANI opt for GLM
;
; Selection:
;   - start with parameter 2
;   - when CPUID.1:ECX lacks AVX or OSXSAVE: parameter 5 if CPUID.7:EBX
;     reports SHA, otherwise parameter 2
;   - otherwise parameter 3, upgraded to parameter 4 when CPUID.7:EBX
;     reports AVX2; reset to parameter 2 when XGETBV(0) does not report
;     XMM and YMM state enabled
;   The SHA bit is not consulted on the AVX path.
;;;;;
%macro mbin_dispatch_sse_to_avx2_shani 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default to SSE 00/01

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		lea	mbin_rbx, [%3 WRT_OPT] ; AVX (gen2) opt func
		jne	_%1_shani_check ; AVX is not available so check shani
		mov	mbin_rsi, mbin_rbx

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

		;; Does it have xmm and ymm support
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		je	_%1_init_done
		lea	mbin_rsi, [%2 WRT_OPT] ; OS did not enable YMM state, back to base

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret

	;; Out-of-line path, reached only when AVX/OSXSAVE are missing
	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%5 WRT_OPT] ; SHANI opt func
		cmovne	mbin_rsi, mbin_rbx
		jmp	_%1_init_done ; end
%endmacro

;;;;;
; mbin_dispatch_base_to_avx512_shani parameters
; derived from mbin_dispatch_init6
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> SHANI opt for GLM
; 8-> SHANI opt for CNL
;
; Selection:
;   - parameter 2 by default; done if CPUID.1:ECX lacks SSE4_2
;   - parameter 3 otherwise
;   - when OSXSAVE is clear, XGETBV(0) lacks XMM/YMM state, or
;     CPUID.1:ECX lacks AVX: parameter 7 if CPUID.7:EBX reports SHA,
;     otherwise keep parameter 3
;   - parameter 4 on the AVX path; done if CPUID.7:EBX lacks AVX2
;   - parameter 5 with AVX2; done if XGETBV(0) lacks ZMM/opmask state
;   - parameter 6 when CPUID.7:EBX has every FLAGS_CPUID7_EBX_AVX512_G1
;     bit set
;   - finally parameter 8 replaces the choice when CPUID.7:EBX reports
;     SHA; this last test runs whether or not the G1 test matched
;
; Register roles inside the routine:
;   ebx = saved CPUID.1:ECX (until CPUID leaf 7 overwrites it)
;   edi = saved XGETBV(0) low dword
;;;;;
%macro mbin_dispatch_base_to_avx512_shani 8
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_shani_check
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_shani_check
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_shani_check
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		lea	mbin_rbx, [%6 WRT_OPT] ; AVX512/06 opt
		cmove	mbin_rsi, mbin_rbx

		;; Test for SHANI
		;; ebx was overwritten above, so leaf 7 is queried again
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%8 WRT_OPT] ; SHANI opt sse func
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret

	;; Out-of-line path, reached only when the AVX prerequisites fail
	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%7 WRT_OPT] ; SHANI opt sse func
		cmovne	mbin_rsi, mbin_rbx
		jmp	_%1_init_done ; end
%endmacro



%endif ; ifndef _MULTIBINARY_ASM_