xref: /isa-l_crypto/include/multibinary.asm (revision 860585444755e926bd72600b39758458c0a1c9da)
16df3ef80SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
28dc5d913SGreg Tucker;  Copyright(c) 2011-2019 Intel Corporation All rights reserved.
36df3ef80SGreg Tucker;
46df3ef80SGreg Tucker;  Redistribution and use in source and binary forms, with or without
56df3ef80SGreg Tucker;  modification, are permitted provided that the following conditions
66df3ef80SGreg Tucker;  are met:
76df3ef80SGreg Tucker;    * Redistributions of source code must retain the above copyright
86df3ef80SGreg Tucker;      notice, this list of conditions and the following disclaimer.
96df3ef80SGreg Tucker;    * Redistributions in binary form must reproduce the above copyright
106df3ef80SGreg Tucker;      notice, this list of conditions and the following disclaimer in
116df3ef80SGreg Tucker;      the documentation and/or other materials provided with the
126df3ef80SGreg Tucker;      distribution.
136df3ef80SGreg Tucker;    * Neither the name of Intel Corporation nor the names of its
146df3ef80SGreg Tucker;      contributors may be used to endorse or promote products derived
156df3ef80SGreg Tucker;      from this software without specific prior written permission.
166df3ef80SGreg Tucker;
176df3ef80SGreg Tucker;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
186df3ef80SGreg Tucker;  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
196df3ef80SGreg Tucker;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
206df3ef80SGreg Tucker;  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
216df3ef80SGreg Tucker;  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
226df3ef80SGreg Tucker;  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
236df3ef80SGreg Tucker;  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
246df3ef80SGreg Tucker;  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
256df3ef80SGreg Tucker;  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
266df3ef80SGreg Tucker;  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
276df3ef80SGreg Tucker;  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
286df3ef80SGreg Tucker;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
296df3ef80SGreg Tucker
306df3ef80SGreg Tucker%ifndef _MULTIBINARY_ASM_
316df3ef80SGreg Tucker%define _MULTIBINARY_ASM_
326df3ef80SGreg Tucker
; Pointer-sized aliases used by every dispatch macro in this file.
; Only the elf32 output format selects the 32-bit set; any other output
; format gets the 64-bit set.
;   mbin_def_ptr - data directive that reserves one function pointer
;   mbin_ptr_sz  - operand-size keyword for a memory-indirect jump
;   mbin_r??     - pointer-width view of the general purpose registers
%ifnidn __OUTPUT_FORMAT__, elf32
 %define mbin_def_ptr	dq
 %define mbin_ptr_sz	qword
 %define mbin_rax	rax
 %define mbin_rbx	rbx
 %define mbin_rcx	rcx
 %define mbin_rdx	rdx
 %define mbin_rsi	rsi
 %define mbin_rdi	rdi
%else
 %define mbin_def_ptr	dd
 %define mbin_ptr_sz	dword
 %define mbin_rax	eax
 %define mbin_rbx	ebx
 %define mbin_rcx	ecx
 %define mbin_rdx	edx
 %define mbin_rsi	esi
 %define mbin_rdi	edi
%endif

; Feature level defaults to 4 unless it was defined before this point.
; It is compared against 6 and 10 further down to decide whether
; mbin_dispatch_init6 / mbin_dispatch_init7 get their full bodies.
%ifndef AS_FEATURE_LEVEL
 %define AS_FEATURE_LEVEL 4
%endif
568dc5d913SGreg Tucker
;;;;
; mbin_interface parameters
; 1-> function name
;
; For a function called <name> this macro emits:
;   <name>_dispatched  pointer-sized slot in .data, initially <name>_mbinit
;   <name>_mbinit      first-call stub: calls <name>_dispatch_init and then
;                      runs straight into <name>
;   <name>             entry point (declared through mk_global): indirect
;                      jump through <name>_dispatched
; <name>_dispatch_init is not defined here; the mbin_dispatch_* macros in
; this file each define it and store the chosen routine into the slot.
;;;;
%macro mbin_interface 1
	section .data
	%1_dispatched:
		mbin_def_ptr	%1_mbinit	; unresolved state: slot targets the stub

	section .text
	mk_global %1, function
	%1_mbinit:
		;;; reached only while the slot still holds its initial value
		call	%1_dispatch_init
		;;; no ret here - continue into the entry point below
	%1:
		jmp	mbin_ptr_sz [%1_dispatched]
%endmacro
806df3ef80SGreg Tucker
;;;;;
; mbin_dispatch_init parameters
; Use this macro when SSE/00/01 is the minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
;
; Defines <name>_dispatch_init, which stores one of args 2-4 into
; <name>_dispatched.  Every general purpose register it touches is saved
; and restored; the flags are modified.
;;;;;
%macro mbin_dispatch_init 4
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; start from the SSE 00/01 base

		;; The CPU must report AVX and OSXSAVE together
		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_init_done ; stay on the base function

		;; xmm and ymm state must be enabled, otherwise stay on the base
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; AVX (gen2) opt func

		;; Upgrade to AVX2 when cpuid leaf 7 reports it
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX2 (gen4) opt func; lea keeps the flags
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish the selection
		pop	mbin_rsi
		ret
%endmacro
1326df3ef80SGreg Tucker
;;;;;
; mbin_dispatch_init2 parameters
;  For functions that only have a base implementation
; 1-> function name
; 2-> base function
;
; Defines <name>_dispatch_init, which unconditionally stores the address of
; the base function into <name>_dispatched.  No CPU feature is queried and
; no register is left modified.
;;;;;
%macro mbin_dispatch_init2 2
	section .text
	%1_dispatch_init:
		push	mbin_rax		; scratch register, restored before ret
		lea	mbin_rax, [%2 WRT_OPT]
		mov	[%1_dispatched], mbin_rax
		pop	mbin_rax
		ret
%endmacro
1486df3ef80SGreg Tucker
;;;;;
; mbin_dispatch_init5 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
;
; Defines <name>_dispatch_init, which stores one of args 2-5 into
; <name>_dispatched.  A routine is only selected after every check it
; depends on has passed:
;   arg 3: cpuid leaf 1 reports SSE4.1
;   arg 4: cpuid leaf 1 reports AVX and OSXSAVE, and xgetbv reports xmm and
;          ymm state enabled
;   arg 5: as arg 4, plus cpuid leaf 7 reports AVX2
; When AVX is reported but xmm/ymm state is not enabled, the base/SSE4.1
; choice made from cpuid is kept as it is.
; Every general purpose register touched is saved and restored; the flags
; are modified.
;;;;;
%macro mbin_dispatch_init5 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		; Test for SSE4.1
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		lea	mbin_rbx, [%3 WRT_OPT] ; SSE opt func; lea keeps the flags
		cmovne	mbin_rsi, mbin_rbx

		;; AVX needs the feature bit and OSXSAVE
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_init_done ; AVX is not available so end

		;; Does it have xmm and ymm support
		;; Checked before any AVX routine is selected, so a failure
		;; leaves the base/SSE4.1 selection above untouched.
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX (gen2) opt func

		;; Try for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%5 WRT_OPT] ; AVX2 (gen4) opt func
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
2056df3ef80SGreg Tucker
%if AS_FEATURE_LEVEL >= 6
;;;;;
; mbin_dispatch_init6 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_1 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
;
; Defines <name>_dispatch_init.  The checks form a ladder: the selection
; starts at the base function and is raised one step per passed check; the
; first failed check ends the probing.  Every general purpose register
; touched is saved and restored; the flags are modified.
;;;;;
%macro mbin_dispatch_init6 6
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; step 0: base function

		;; Step 1: SSE4_1
		mov	eax, 1
		cpuid
		test	ecx, FLAG_CPUID1_ECX_SSE4_1
		jz	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; 00/01 opt

		;; Step 2: AVX - feature bit, OSXSAVE and enabled xmm/ymm state
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax ; kept for the zmm/opmask test in step 4
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Step 3: AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done
		lea	mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func

		;; Step 4: AVX512 - enabled zmm/opmask state and the G1 feature set
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish the selection
		pop	mbin_rsi
		ret
%endmacro

%else
; Feature level below 6: the sixth argument is ignored and the work is
; delegated to mbin_dispatch_init5.
%macro mbin_dispatch_init6 6
	mbin_dispatch_init5 %1, %2, %3, %4, %5
%endmacro
%endif
2818dc5d913SGreg Tucker
%if AS_FEATURE_LEVEL >= 10
;;;;;
; mbin_dispatch_init7 parameters
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> AVX512 Update/10 opt func
;
; Defines <name>_dispatch_init, which stores one of args 2-7 into
; <name>_dispatched.  The selection starts at the base function and is
; raised one level per passed check; the first failed check ends the
; probing.  In particular the AVX512 Update routine is only considered
; after the AVX512 G1 check has passed.
; Every general purpose register touched is saved and restored; the flags
; are modified.
;;;;;
%macro mbin_dispatch_init7 7
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; Default - use base function

		mov	eax, 1
		cpuid
		mov	ebx, ecx ; save cpuid1.ecx
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		je	_%1_init_done	  ; Use base function if no SSE4_2
		lea	mbin_rsi, [%3 WRT_OPT] ; SSE possible so use 00/01 opt

		;; Test for XMM_YMM support/AVX
		test	ecx, FLAG_CPUID1_ECX_OSXSAVE
		je	_%1_init_done
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax	  ; save xgetbv.eax for the AVX512 test

		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		test	ebx, FLAG_CPUID1_ECX_AVX
		je	_%1_init_done
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; Test for AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		je	_%1_init_done		; No AVX2 possible
		lea	mbin_rsi, [%5 WRT_OPT] 	; AVX2/04 opt func

		;; Test for AVX512
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done	  ; No AVX512 possible
		and	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	ebx, FLAGS_CPUID7_EBX_AVX512_G1
		jne	_%1_init_done	  ; No AVX512 G1, so no AVX512 Update either
		lea	mbin_rsi, [%6 WRT_OPT] ; AVX512/06 opt

		;; Test for AVX512 Update; ecx still holds cpuid7.ecx
		and	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		cmp	ecx, FLAGS_CPUID7_ECX_AVX512_G2
		lea	mbin_rbx, [%7 WRT_OPT] ; AVX512 Update/10 opt; lea keeps the flags
		cmove	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi
		pop	mbin_rsi
		ret
%endmacro
%else
; Feature level below 10: the seventh argument is ignored and the work is
; delegated to mbin_dispatch_init6.
%macro mbin_dispatch_init7 7
	mbin_dispatch_init6 %1, %2, %3, %4, %5, %6
%endmacro
%endif
3628dc5d913SGreg Tucker
;;;;;
; mbin_dispatch_sse_to_avx2_shani parameters
; derived from mbin_dispatch_init
; Use this macro when SSE/00/01 is the minimum requirement
; 1-> function name
; 2-> SSE/00/01 optimized function used as base
; 3-> AVX or AVX/02 opt func
; 4-> AVX2 or AVX/04 opt func
; 5-> SHANI opt for GLM
;
; Defines <name>_dispatch_init.  The SHANI routine is only probed when
; cpuid leaf 1 does not report AVX together with OSXSAVE; otherwise the
; choice is made between args 2-4.  Every general purpose register touched
; is saved and restored; the flags are modified.
;;;;;
%macro mbin_dispatch_sse_to_avx2_shani 5
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		lea	mbin_rsi, [%2 WRT_OPT] ; start from the SSE 00/01 base

		mov	eax, 1
		cpuid
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_shani_check ; no usable AVX, SHANI is the only upgrade left

		;; xmm and ymm state must be enabled, otherwise stay on the base
		;; (the SHANI routine is not probed on this path)
		xor	ecx, ecx
		xgetbv
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; AVX (gen2) opt func

		;; Upgrade to AVX2 when cpuid leaf 7 reports it
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		lea	mbin_rbx, [%4 WRT_OPT] ; AVX2 (gen4) opt func; lea keeps the flags
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish the selection
		pop	mbin_rsi
		ret

	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		jz	_%1_init_done ; no SHA bit: keep the base function
		lea	mbin_rsi, [%5 WRT_OPT] ; SHANI opt func
		jmp	_%1_init_done
%endmacro
42530604006SXiaodong Liu
;;;;;
; mbin_dispatch_base_to_avx512_shani parameters
; derived from mbin_dispatch_init6
; 1-> function name
; 2-> base function
; 3-> SSE4_2 or 00/01 optimized function
; 4-> AVX/02 opt func
; 5-> AVX2/04 opt func
; 6-> AVX512/06 opt func
; 7-> SHANI opt for GLM
; 8-> SHANI opt for CNL
;
; Defines <name>_dispatch_init.  Arg 7 is probed when SSE4_2 is present but
; the AVX checks fail; arg 8 is probed after the zmm/opmask state check has
; passed.  Every general purpose register touched is saved and restored;
; the flags are modified.
;;;;;
%macro mbin_dispatch_base_to_avx512_shani 8
	section .text
	%1_dispatch_init:
		push	mbin_rsi
		push	mbin_rax
		push	mbin_rbx
		push	mbin_rcx
		push	mbin_rdx
		push	mbin_rdi
		lea	mbin_rsi, [%2 WRT_OPT] ; step 0: base function

		;; SSE4_2 is the entry ticket for everything else
		mov	eax, 1
		cpuid
		test	ecx, FLAG_CPUID1_ECX_SSE4_2
		jz	_%1_init_done
		lea	mbin_rsi, [%3 WRT_OPT] ; 00/01 opt

		;; AVX - feature bit, OSXSAVE and enabled xmm/ymm state;
		;; any miss sends us to the SSE-level SHANI probe
		and	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		cmp	ecx, (FLAG_CPUID1_ECX_AVX | FLAG_CPUID1_ECX_OSXSAVE)
		jne	_%1_shani_check
		xor	ecx, ecx
		xgetbv	; xcr -> edx:eax
		mov	edi, eax ; kept for the zmm/opmask test below
		and	eax, FLAG_XGETBV_EAX_XMM_YMM
		cmp	eax, FLAG_XGETBV_EAX_XMM_YMM
		jne	_%1_shani_check
		lea	mbin_rsi, [%4 WRT_OPT] ; AVX/02 opt

		;; AVX2
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_AVX2
		jz	_%1_init_done
		lea	mbin_rsi, [%5 WRT_OPT] ; AVX2/04 opt func

		;; AVX512 - zmm/opmask state first, then the G1 feature set
		and	edi, FLAG_XGETBV_EAX_ZMM_OPM
		cmp	edi, FLAG_XGETBV_EAX_ZMM_OPM
		jne	_%1_init_done
		mov	edx, ebx ; compare on a copy, ebx stays cpuid7.ebx
		and	edx, FLAGS_CPUID7_EBX_AVX512_G1
		cmp	edx, FLAGS_CPUID7_EBX_AVX512_G1
		lea	mbin_rdx, [%6 WRT_OPT] ; AVX512/06 opt; lea keeps the flags
		cmove	mbin_rsi, mbin_rdx

		;; SHANI on top of the above
		;; NOTE(review): the CNL SHANI routine is taken whenever the SHA
		;; bit is set here, whether or not the G1 comparison matched -
		;; confirm that this is intended.
		test	ebx, FLAG_CPUID7_EBX_SHA
		lea	mbin_rbx, [%8 WRT_OPT] ; SHANI opt for CNL
		cmovne	mbin_rsi, mbin_rbx

	_%1_init_done:
		pop	mbin_rdi
		pop	mbin_rdx
		pop	mbin_rcx
		pop	mbin_rbx
		pop	mbin_rax
		mov	[%1_dispatched], mbin_rsi ; publish the selection
		pop	mbin_rsi
		ret

	_%1_shani_check:
		xor	ecx, ecx
		mov	eax, 7
		cpuid
		test	ebx, FLAG_CPUID7_EBX_SHA
		jz	_%1_init_done ; no SHA bit: keep the 00/01 routine
		lea	mbin_rsi, [%7 WRT_OPT] ; SHANI opt for GLM
		jmp	_%1_init_done
%endmacro
51430604006SXiaodong Liu
51530604006SXiaodong Liu
51630604006SXiaodong Liu
5176df3ef80SGreg Tucker%endif ; ifndef _MULTIBINARY_ASM_
518