; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=ALL,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=ALL,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX,AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3,+gfni | FileCheck %s --check-prefixes=ALL,GFNISSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+gfni | FileCheck %s --check-prefixes=ALL,GFNIAVX,GFNIAVX512,GFNIAVX512BW

; Make sure we don't crash with avx512bw and xop
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx512bw

define i8 @test_bitreverse_i8(i8 %a) nounwind {
; SSE-LABEL: test_bitreverse_i8:
; SSE:       # %bb.0:
; SSE-NEXT:    rolb $4, %dil
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andb $51, %al
; SSE-NEXT:    shlb $2, %al
; SSE-NEXT:    shrb $2, %dil
; SSE-NEXT:    andb $51, %dil
; SSE-NEXT:    orb %dil, %al
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andb $85, %cl
; SSE-NEXT:    addb %cl, %cl
; SSE-NEXT:    shrb %al
; SSE-NEXT:    andb $85, %al
; SSE-NEXT:    orb %cl, %al
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    rolb $4, %dil
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andb $51, %al
; AVX-NEXT:    shlb $2, %al
; AVX-NEXT:    shrb $2, %dil
; AVX-NEXT:    andb $51, %dil
; AVX-NEXT:    orb %dil, %al
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andb $85, %cl
; AVX-NEXT:    addb %cl, %cl
; AVX-NEXT:    shrb %al
; AVX-NEXT:    andb $85, %al
; AVX-NEXT:    orb %cl, %al
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $al killed $al killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    # kill: def $al killed $al killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    # kill: def $al killed $al killed $eax
; GFNIAVX-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}

define i16 @test_bitreverse_i16(i16 %a) nounwind {
; SSE-LABEL: test_bitreverse_i16:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    rolw $8, %di
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $3855, %eax # imm = 0xF0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $3855, %edi # imm = 0xF0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $13107, %eax # imm = 0x3333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $13107, %edi # imm = 0x3333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $21845, %ecx # imm = 0x5555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $21845, %eax # imm = 0x5555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    # kill: def $ax killed $ax killed $eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    rolw $8, %di
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $3855, %eax # imm = 0xF0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $3855, %edi # imm = 0xF0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $13107, %eax # imm = 0x3333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $13107, %edi # imm = 0x3333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $21845, %ecx # imm = 0x5555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $21845, %eax # imm = 0x5555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    # kill: def $ax killed $ax killed $eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    rolw $8, %ax
; GFNISSE-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    rolw $8, %ax
; GFNIAVX-NEXT:    # kill: def $ax killed $ax killed $eax
; GFNIAVX-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}

define i32 @test_bitreverse_i32(i32 %a) nounwind {
; SSE-LABEL: test_bitreverse_i32:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    bswapl %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; SSE-NEXT:    shll $4, %eax
; SSE-NEXT:    shrl $4, %edi
; SSE-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; SSE-NEXT:    orl %eax, %edi
; SSE-NEXT:    movl %edi, %eax
; SSE-NEXT:    andl $858993459, %eax # imm = 0x33333333
; SSE-NEXT:    shrl $2, %edi
; SSE-NEXT:    andl $858993459, %edi # imm = 0x33333333
; SSE-NEXT:    leal (%rdi,%rax,4), %eax
; SSE-NEXT:    movl %eax, %ecx
; SSE-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; SSE-NEXT:    shrl %eax
; SSE-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; SSE-NEXT:    leal (%rax,%rcx,2), %eax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    bswapl %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; AVX-NEXT:    shll $4, %eax
; AVX-NEXT:    shrl $4, %edi
; AVX-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; AVX-NEXT:    orl %eax, %edi
; AVX-NEXT:    movl %edi, %eax
; AVX-NEXT:    andl $858993459, %eax # imm = 0x33333333
; AVX-NEXT:    shrl $2, %edi
; AVX-NEXT:    andl $858993459, %edi # imm = 0x33333333
; AVX-NEXT:    leal (%rdi,%rax,4), %eax
; AVX-NEXT:    movl %eax, %ecx
; AVX-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; AVX-NEXT:    shrl %eax
; AVX-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; AVX-NEXT:    leal (%rax,%rcx,2), %eax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovd %edi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovd %xmm0, %eax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movd %edi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movd %xmm0, %eax
; GFNISSE-NEXT:    bswapl %eax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovd %edi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovd %xmm0, %eax
; GFNIAVX-NEXT:    bswapl %eax
; GFNIAVX-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}

define i64 @test_bitreverse_i64(i64 %a) nounwind {
; SSE-LABEL: test_bitreverse_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    bswapq %rdi
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    shrq $4, %rax
; SSE-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    andq %rcx, %rdi
; SSE-NEXT:    shlq $4, %rdi
; SSE-NEXT:    orq %rax, %rdi
; SSE-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; SSE-NEXT:    movq %rdi, %rcx
; SSE-NEXT:    andq %rax, %rcx
; SSE-NEXT:    shrq $2, %rdi
; SSE-NEXT:    andq %rax, %rdi
; SSE-NEXT:    leaq (%rdi,%rcx,4), %rax
; SSE-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; SSE-NEXT:    movq %rax, %rdx
; SSE-NEXT:    andq %rcx, %rdx
; SSE-NEXT:    shrq %rax
; SSE-NEXT:    andq %rcx, %rax
; SSE-NEXT:    leaq (%rax,%rdx,2), %rax
; SSE-NEXT:    retq
;
; AVX-LABEL: test_bitreverse_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    bswapq %rdi
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    shrq $4, %rax
; AVX-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    andq %rcx, %rdi
; AVX-NEXT:    shlq $4, %rdi
; AVX-NEXT:    orq %rax, %rdi
; AVX-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; AVX-NEXT:    movq %rdi, %rcx
; AVX-NEXT:    andq %rax, %rcx
; AVX-NEXT:    shrq $2, %rdi
; AVX-NEXT:    andq %rax, %rdi
; AVX-NEXT:    leaq (%rdi,%rcx,4), %rax
; AVX-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; AVX-NEXT:    movq %rax, %rdx
; AVX-NEXT:    andq %rcx, %rdx
; AVX-NEXT:    shrq %rax
; AVX-NEXT:    andq %rcx, %rax
; AVX-NEXT:    leaq (%rax,%rdx,2), %rax
; AVX-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovq %rdi, %xmm0
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    vmovq %xmm0, %rax
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movq %rdi, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    movq %xmm0, %rax
; GFNISSE-NEXT:    bswapq %rax
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovq %rdi, %xmm0
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    vmovq %xmm0, %rax
; GFNIAVX-NEXT:    bswapq %rax
; GFNIAVX-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}

define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
  ret <16 x i8> %b
}

define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v8i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
  ret <8 x i16> %b
}

define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v4i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v4i32:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
  ret <4 x i32> %b
}

define <2 x i64> @test_bitreverse_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $2, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm1, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    pshufb %xmm2, %xmm3
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm1, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    pshufb %xmm0, %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    vpshufb %xmm2, %xmm3, %xmm2
; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    vpshufb %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    retq
;
; XOP-LABEL: test_bitreverse_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v2i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    gf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v2i64:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; GFNIAVX-NEXT:    retq
  %b = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %a)
  ret <2 x i64> %b
}

define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v32i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v32i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm4, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm5, %xmm6
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm6, %xmm3
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm5
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v32i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm0
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm2, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: test_bitreverse_v32i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX-NEXT:    retq
  %b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
  ret <32 x i8> %b
}

define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    psllw $8, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $1, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm5
; SSE2-NEXT:    psllw $8, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm2, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm5, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v16i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i16:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i16:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v16i16:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
  ret <16 x i16> %b
}

define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm2, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm3, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlw $2, %xmm5
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $4, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pand %xmm3, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $2, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm2
; SSE2-NEXT:    pand %xmm4, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psrlw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; SSSE3-NEXT:    pshufb %xmm4, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm0, %xmm2
; SSSE3-NEXT:    pand %xmm5, %xmm2
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm7
; SSSE3-NEXT:    pshufb %xmm2, %xmm7
; SSSE3-NEXT:    psrlw $4, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm2, %xmm3
; SSSE3-NEXT:    pshufb %xmm0, %xmm3
; SSSE3-NEXT:    por %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm4, %xmm1
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm5, %xmm0
; SSSE3-NEXT:    pshufb %xmm0, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm5, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm6, %xmm2
; SSSE3-NEXT:    movdqa %xmm3, %xmm0
; SSSE3-NEXT:    movdqa %xmm2, %xmm1
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test_bitreverse_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i32:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: test_bitreverse_v8i32:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28]
; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; GFNIAVX512-NEXT:    retq
  %b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
  ret <8 x i32> %b
}
1279
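; Note: test_bitreverse_v4i64 below follows the same two-step recipe as the
; narrower element types: reverse the bytes within each element, then reverse
; the bits within each byte. Per byte the bit reversal is
;   x = (x << 4) | (x >> 4)
;   x = ((x & 0x33) << 2) | ((x >> 2) & 0x33)
;   x = ((x & 0x55) << 1) | ((x >> 1) & 0x55)
; e.g. 0x01 -> 0x10 -> 0x40 -> 0x80. SSE2 implements exactly these three
; shift/mask/or rounds (masks 15/51/85); SSSE3+ replaces them with two pshufb
; nibble lookup tables; GFNI folds the whole per-byte step into a single
; gf2p8affineqb with the bit-reversal matrix [1,2,4,8,16,32,64,128]
; (i.e. 0x8040201008040201).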
1280define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
1281; SSE2-LABEL: test_bitreverse_v4i64:
1282; SSE2:       # %bb.0:
1283; SSE2-NEXT:    pxor %xmm2, %xmm2
1284; SSE2-NEXT:    movdqa %xmm0, %xmm3
1285; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1286; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
1287; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
1288; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
1289; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
1290; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
1291; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
1292; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
1293; SSE2-NEXT:    packuswb %xmm3, %xmm0
1294; SSE2-NEXT:    movdqa %xmm0, %xmm4
1295; SSE2-NEXT:    psrlw $4, %xmm4
1296; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1297; SSE2-NEXT:    pand %xmm3, %xmm4
1298; SSE2-NEXT:    pand %xmm3, %xmm0
1299; SSE2-NEXT:    psllw $4, %xmm0
1300; SSE2-NEXT:    por %xmm4, %xmm0
1301; SSE2-NEXT:    movdqa %xmm0, %xmm5
1302; SSE2-NEXT:    psrlw $2, %xmm5
1303; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1304; SSE2-NEXT:    pand %xmm4, %xmm5
1305; SSE2-NEXT:    pand %xmm4, %xmm0
1306; SSE2-NEXT:    psllw $2, %xmm0
1307; SSE2-NEXT:    por %xmm5, %xmm0
1308; SSE2-NEXT:    movdqa %xmm0, %xmm6
1309; SSE2-NEXT:    psrlw $1, %xmm6
1310; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1311; SSE2-NEXT:    pand %xmm5, %xmm6
1312; SSE2-NEXT:    pand %xmm5, %xmm0
1313; SSE2-NEXT:    paddb %xmm0, %xmm0
1314; SSE2-NEXT:    por %xmm6, %xmm0
1315; SSE2-NEXT:    movdqa %xmm1, %xmm6
1316; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15]
1317; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm6[2,3,0,1]
1318; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm6[3,2,1,0,4,5,6,7]
1319; SSE2-NEXT:    pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,6,5,4]
1320; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1321; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
1322; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
1323; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
1324; SSE2-NEXT:    packuswb %xmm6, %xmm1
1325; SSE2-NEXT:    movdqa %xmm1, %xmm2
1326; SSE2-NEXT:    psrlw $4, %xmm2
1327; SSE2-NEXT:    pand %xmm3, %xmm2
1328; SSE2-NEXT:    pand %xmm3, %xmm1
1329; SSE2-NEXT:    psllw $4, %xmm1
1330; SSE2-NEXT:    por %xmm2, %xmm1
1331; SSE2-NEXT:    movdqa %xmm1, %xmm2
1332; SSE2-NEXT:    psrlw $2, %xmm2
1333; SSE2-NEXT:    pand %xmm4, %xmm2
1334; SSE2-NEXT:    pand %xmm4, %xmm1
1335; SSE2-NEXT:    psllw $2, %xmm1
1336; SSE2-NEXT:    por %xmm2, %xmm1
1337; SSE2-NEXT:    movdqa %xmm1, %xmm2
1338; SSE2-NEXT:    psrlw $1, %xmm2
1339; SSE2-NEXT:    pand %xmm5, %xmm2
1340; SSE2-NEXT:    pand %xmm5, %xmm1
1341; SSE2-NEXT:    paddb %xmm1, %xmm1
1342; SSE2-NEXT:    por %xmm2, %xmm1
1343; SSE2-NEXT:    retq
1344;
1345; SSSE3-LABEL: test_bitreverse_v4i64:
1346; SSSE3:       # %bb.0:
1347; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1348; SSSE3-NEXT:    pshufb %xmm4, %xmm0
1349; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1350; SSSE3-NEXT:    movdqa %xmm0, %xmm2
1351; SSSE3-NEXT:    pand %xmm5, %xmm2
1352; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1353; SSSE3-NEXT:    movdqa %xmm6, %xmm7
1354; SSSE3-NEXT:    pshufb %xmm2, %xmm7
1355; SSSE3-NEXT:    psrlw $4, %xmm0
1356; SSSE3-NEXT:    pand %xmm5, %xmm0
1357; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1358; SSSE3-NEXT:    movdqa %xmm2, %xmm3
1359; SSSE3-NEXT:    pshufb %xmm0, %xmm3
1360; SSSE3-NEXT:    por %xmm7, %xmm3
1361; SSSE3-NEXT:    pshufb %xmm4, %xmm1
1362; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1363; SSSE3-NEXT:    pand %xmm5, %xmm0
1364; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1365; SSSE3-NEXT:    psrlw $4, %xmm1
1366; SSSE3-NEXT:    pand %xmm5, %xmm1
1367; SSSE3-NEXT:    pshufb %xmm1, %xmm2
1368; SSSE3-NEXT:    por %xmm6, %xmm2
1369; SSSE3-NEXT:    movdqa %xmm3, %xmm0
1370; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1371; SSSE3-NEXT:    retq
1372;
1373; AVX1-LABEL: test_bitreverse_v4i64:
1374; AVX1:       # %bb.0:
1375; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1376; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1377; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1378; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1379; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1380; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1381; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1382; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1383; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1384; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1385; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1386; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1387; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1388; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm2
1389; AVX1-NEXT:    vpshufb %xmm2, %xmm5, %xmm2
1390; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1391; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1392; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1393; AVX1-NEXT:    vpor %xmm0, %xmm2, %xmm0
1394; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1395; AVX1-NEXT:    retq
1396;
1397; AVX2-LABEL: test_bitreverse_v4i64:
1398; AVX2:       # %bb.0:
1399; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1400; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1401; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm2
1402; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1403; AVX2-NEXT:    # ymm3 = mem[0,1,0,1]
1404; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1405; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1406; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1407; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1408; AVX2-NEXT:    # ymm1 = mem[0,1,0,1]
1409; AVX2-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1410; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
1411; AVX2-NEXT:    retq
1412;
1413; AVX512-LABEL: test_bitreverse_v4i64:
1414; AVX512:       # %bb.0:
1415; AVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1416; AVX512-NEXT:    vpbroadcastb {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1417; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm2
1418; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1419; AVX512-NEXT:    # ymm3 = mem[0,1,0,1]
1420; AVX512-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
1421; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
1422; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
1423; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1424; AVX512-NEXT:    # ymm1 = mem[0,1,0,1]
1425; AVX512-NEXT:    vpshufb %ymm0, %ymm1, %ymm0
1426; AVX512-NEXT:    vpor %ymm0, %ymm2, %ymm0
1427; AVX512-NEXT:    retq
1428;
1429; XOPAVX1-LABEL: test_bitreverse_v4i64:
1430; XOPAVX1:       # %bb.0:
1431; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1432; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1433; XOPAVX1-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1434; XOPAVX1-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1435; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1436; XOPAVX1-NEXT:    retq
1437;
1438; XOPAVX2-LABEL: test_bitreverse_v4i64:
1439; XOPAVX2:       # %bb.0:
1440; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1441; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1442; XOPAVX2-NEXT:    vpperm %xmm2, %xmm1, %xmm0, %xmm1
1443; XOPAVX2-NEXT:    vpperm %xmm2, %xmm0, %xmm0, %xmm0
1444; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1445; XOPAVX2-NEXT:    retq
1446;
1447; GFNISSE-LABEL: test_bitreverse_v4i64:
1448; GFNISSE:       # %bb.0:
1449; GFNISSE-NEXT:    movdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1450; GFNISSE-NEXT:    pshufb %xmm2, %xmm0
1451; GFNISSE-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1452; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm0
1453; GFNISSE-NEXT:    pshufb %xmm2, %xmm1
1454; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm3, %xmm1
1455; GFNISSE-NEXT:    retq
1456;
1457; GFNIAVX1-LABEL: test_bitreverse_v4i64:
1458; GFNIAVX1:       # %bb.0:
1459; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1460; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
1461; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1462; GFNIAVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
1463; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1464; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1465; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm1, %ymm0, %ymm0
1466; GFNIAVX1-NEXT:    retq
1467;
1468; GFNIAVX2-LABEL: test_bitreverse_v4i64:
1469; GFNIAVX2:       # %bb.0:
1470; GFNIAVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1471; GFNIAVX2-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1472; GFNIAVX2-NEXT:    retq
1473;
1474; GFNIAVX512-LABEL: test_bitreverse_v4i64:
1475; GFNIAVX512:       # %bb.0:
1476; GFNIAVX512-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24]
1477; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1478; GFNIAVX512-NEXT:    retq
1479  %b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)
1480  ret <4 x i64> %b
1481}
1482
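; Note: for v64i8 there is no cross-byte swap to do, so the element-reversal
; shuffle disappears entirely: GFNI emits a single gf2p8affineqb per register
; (AVX512 folds the matrix load with a {1to8} qword broadcast), and XOP needs
; one vpperm per 128-bit half; its selector bytes 80..95 (0x50+i) use the
; VPPERM bit-reverse operation on the selected source bytes.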
1483define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
1484; SSE2-LABEL: test_bitreverse_v64i8:
1485; SSE2:       # %bb.0:
1486; SSE2-NEXT:    movdqa %xmm0, %xmm5
1487; SSE2-NEXT:    psrlw $4, %xmm5
1488; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1489; SSE2-NEXT:    pand %xmm4, %xmm5
1490; SSE2-NEXT:    pand %xmm4, %xmm0
1491; SSE2-NEXT:    psllw $4, %xmm0
1492; SSE2-NEXT:    por %xmm5, %xmm0
1493; SSE2-NEXT:    movdqa %xmm0, %xmm6
1494; SSE2-NEXT:    psrlw $2, %xmm6
1495; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1496; SSE2-NEXT:    pand %xmm5, %xmm6
1497; SSE2-NEXT:    pand %xmm5, %xmm0
1498; SSE2-NEXT:    psllw $2, %xmm0
1499; SSE2-NEXT:    por %xmm6, %xmm0
1500; SSE2-NEXT:    movdqa %xmm0, %xmm7
1501; SSE2-NEXT:    psrlw $1, %xmm7
1502; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1503; SSE2-NEXT:    pand %xmm6, %xmm7
1504; SSE2-NEXT:    pand %xmm6, %xmm0
1505; SSE2-NEXT:    paddb %xmm0, %xmm0
1506; SSE2-NEXT:    por %xmm7, %xmm0
1507; SSE2-NEXT:    movdqa %xmm1, %xmm7
1508; SSE2-NEXT:    psrlw $4, %xmm7
1509; SSE2-NEXT:    pand %xmm4, %xmm7
1510; SSE2-NEXT:    pand %xmm4, %xmm1
1511; SSE2-NEXT:    psllw $4, %xmm1
1512; SSE2-NEXT:    por %xmm7, %xmm1
1513; SSE2-NEXT:    movdqa %xmm1, %xmm7
1514; SSE2-NEXT:    psrlw $2, %xmm7
1515; SSE2-NEXT:    pand %xmm5, %xmm7
1516; SSE2-NEXT:    pand %xmm5, %xmm1
1517; SSE2-NEXT:    psllw $2, %xmm1
1518; SSE2-NEXT:    por %xmm7, %xmm1
1519; SSE2-NEXT:    movdqa %xmm1, %xmm7
1520; SSE2-NEXT:    psrlw $1, %xmm7
1521; SSE2-NEXT:    pand %xmm6, %xmm7
1522; SSE2-NEXT:    pand %xmm6, %xmm1
1523; SSE2-NEXT:    paddb %xmm1, %xmm1
1524; SSE2-NEXT:    por %xmm7, %xmm1
1525; SSE2-NEXT:    movdqa %xmm2, %xmm7
1526; SSE2-NEXT:    psrlw $4, %xmm7
1527; SSE2-NEXT:    pand %xmm4, %xmm7
1528; SSE2-NEXT:    pand %xmm4, %xmm2
1529; SSE2-NEXT:    psllw $4, %xmm2
1530; SSE2-NEXT:    por %xmm7, %xmm2
1531; SSE2-NEXT:    movdqa %xmm2, %xmm7
1532; SSE2-NEXT:    psrlw $2, %xmm7
1533; SSE2-NEXT:    pand %xmm5, %xmm7
1534; SSE2-NEXT:    pand %xmm5, %xmm2
1535; SSE2-NEXT:    psllw $2, %xmm2
1536; SSE2-NEXT:    por %xmm7, %xmm2
1537; SSE2-NEXT:    movdqa %xmm2, %xmm7
1538; SSE2-NEXT:    psrlw $1, %xmm7
1539; SSE2-NEXT:    pand %xmm6, %xmm7
1540; SSE2-NEXT:    pand %xmm6, %xmm2
1541; SSE2-NEXT:    paddb %xmm2, %xmm2
1542; SSE2-NEXT:    por %xmm7, %xmm2
1543; SSE2-NEXT:    movdqa %xmm3, %xmm7
1544; SSE2-NEXT:    psrlw $4, %xmm7
1545; SSE2-NEXT:    pand %xmm4, %xmm7
1546; SSE2-NEXT:    pand %xmm4, %xmm3
1547; SSE2-NEXT:    psllw $4, %xmm3
1548; SSE2-NEXT:    por %xmm7, %xmm3
1549; SSE2-NEXT:    movdqa %xmm3, %xmm4
1550; SSE2-NEXT:    psrlw $2, %xmm4
1551; SSE2-NEXT:    pand %xmm5, %xmm4
1552; SSE2-NEXT:    pand %xmm5, %xmm3
1553; SSE2-NEXT:    psllw $2, %xmm3
1554; SSE2-NEXT:    por %xmm4, %xmm3
1555; SSE2-NEXT:    movdqa %xmm3, %xmm4
1556; SSE2-NEXT:    psrlw $1, %xmm4
1557; SSE2-NEXT:    pand %xmm6, %xmm4
1558; SSE2-NEXT:    pand %xmm6, %xmm3
1559; SSE2-NEXT:    paddb %xmm3, %xmm3
1560; SSE2-NEXT:    por %xmm4, %xmm3
1561; SSE2-NEXT:    retq
1562;
1563; SSSE3-LABEL: test_bitreverse_v64i8:
1564; SSSE3:       # %bb.0:
1565; SSSE3-NEXT:    movdqa %xmm0, %xmm5
1566; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1567; SSSE3-NEXT:    pand %xmm8, %xmm0
1568; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1569; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1570; SSSE3-NEXT:    pshufb %xmm0, %xmm6
1571; SSSE3-NEXT:    psrlw $4, %xmm5
1572; SSSE3-NEXT:    pand %xmm8, %xmm5
1573; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1574; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1575; SSSE3-NEXT:    pshufb %xmm5, %xmm0
1576; SSSE3-NEXT:    por %xmm6, %xmm0
1577; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1578; SSSE3-NEXT:    pand %xmm8, %xmm5
1579; SSSE3-NEXT:    movdqa %xmm7, %xmm6
1580; SSSE3-NEXT:    pshufb %xmm5, %xmm6
1581; SSSE3-NEXT:    psrlw $4, %xmm1
1582; SSSE3-NEXT:    pand %xmm8, %xmm1
1583; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1584; SSSE3-NEXT:    pshufb %xmm1, %xmm5
1585; SSSE3-NEXT:    por %xmm6, %xmm5
1586; SSSE3-NEXT:    movdqa %xmm2, %xmm1
1587; SSSE3-NEXT:    pand %xmm8, %xmm1
1588; SSSE3-NEXT:    movdqa %xmm7, %xmm9
1589; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1590; SSSE3-NEXT:    psrlw $4, %xmm2
1591; SSSE3-NEXT:    pand %xmm8, %xmm2
1592; SSSE3-NEXT:    movdqa %xmm4, %xmm6
1593; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1594; SSSE3-NEXT:    por %xmm9, %xmm6
1595; SSSE3-NEXT:    movdqa %xmm3, %xmm1
1596; SSSE3-NEXT:    pand %xmm8, %xmm1
1597; SSSE3-NEXT:    pshufb %xmm1, %xmm7
1598; SSSE3-NEXT:    psrlw $4, %xmm3
1599; SSSE3-NEXT:    pand %xmm8, %xmm3
1600; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1601; SSSE3-NEXT:    por %xmm7, %xmm4
1602; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1603; SSSE3-NEXT:    movdqa %xmm6, %xmm2
1604; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1605; SSSE3-NEXT:    retq
1606;
1607; AVX1-LABEL: test_bitreverse_v64i8:
1608; AVX1:       # %bb.0:
1609; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1610; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1611; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1612; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1613; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1614; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1615; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1616; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1617; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1618; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1619; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
1620; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1621; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1622; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1623; AVX1-NEXT:    vpshufb %xmm0, %xmm6, %xmm0
1624; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
1625; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1626; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1627; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
1628; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1629; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1630; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1631; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
1632; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
1633; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
1634; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
1635; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1636; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1637; AVX1-NEXT:    vpshufb %xmm1, %xmm6, %xmm1
1638; AVX1-NEXT:    vpor %xmm1, %xmm4, %xmm1
1639; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1640; AVX1-NEXT:    retq
1641;
1642; AVX2-LABEL: test_bitreverse_v64i8:
1643; AVX2:       # %bb.0:
1644; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1645; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm3
1646; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1647; AVX2-NEXT:    # ymm4 = mem[0,1,0,1]
1648; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1649; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1650; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
1651; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1652; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
1653; AVX2-NEXT:    vpshufb %ymm0, %ymm5, %ymm0
1654; AVX2-NEXT:    vpor %ymm0, %ymm3, %ymm0
1655; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm3
1656; AVX2-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1657; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1658; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
1659; AVX2-NEXT:    vpshufb %ymm1, %ymm5, %ymm1
1660; AVX2-NEXT:    vpor %ymm1, %ymm3, %ymm1
1661; AVX2-NEXT:    retq
1662;
1663; AVX512F-LABEL: test_bitreverse_v64i8:
1664; AVX512F:       # %bb.0:
1665; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1666; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1667; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm3
1668; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1669; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
1670; AVX512F-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
1671; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm5
1672; AVX512F-NEXT:    vpshufb %ymm5, %ymm4, %ymm4
1673; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
1674; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1675; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
1676; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1677; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
1678; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1679; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1680; AVX512F-NEXT:    vpand %ymm2, %ymm0, %ymm0
1681; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1682; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1683; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
1684; AVX512F-NEXT:    retq
1685;
1686; AVX512BW-LABEL: test_bitreverse_v64i8:
1687; AVX512BW:       # %bb.0:
1688; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1689; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
1690; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1691; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1692; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
1693; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
1694; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
1695; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1696; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1697; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
1698; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
1699; AVX512BW-NEXT:    retq
1700;
1701; XOPAVX1-LABEL: test_bitreverse_v64i8:
1702; XOPAVX1:       # %bb.0:
1703; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1704; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1705; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1706; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1707; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1708; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1709; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1710; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1711; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1712; XOPAVX1-NEXT:    retq
1713;
1714; XOPAVX2-LABEL: test_bitreverse_v64i8:
1715; XOPAVX2:       # %bb.0:
1716; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1717; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95]
1718; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1719; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
1720; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1721; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
1722; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
1723; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
1724; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
1725; XOPAVX2-NEXT:    retq
1726;
1727; GFNISSE-LABEL: test_bitreverse_v64i8:
1728; GFNISSE:       # %bb.0:
1729; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1730; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm0
1731; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm1
1732; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm2
1733; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm4, %xmm3
1734; GFNISSE-NEXT:    retq
1735;
1736; GFNIAVX1-LABEL: test_bitreverse_v64i8:
1737; GFNIAVX1:       # %bb.0:
1738; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1739; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1740; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1741; GFNIAVX1-NEXT:    retq
1742;
1743; GFNIAVX2-LABEL: test_bitreverse_v64i8:
1744; GFNIAVX2:       # %bb.0:
1745; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1746; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
1747; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
1748; GFNIAVX2-NEXT:    retq
1749;
1750; GFNIAVX512-LABEL: test_bitreverse_v64i8:
1751; GFNIAVX512:       # %bb.0:
1752; GFNIAVX512-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1753; GFNIAVX512-NEXT:    retq
1754  %b = call <64 x i8> @llvm.bitreverse.v64i8(<64 x i8> %a)
1755  ret <64 x i8> %b
1756}
1757
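; Note: v32i16 prepends a byte swap within each 16-bit element: SSE2 uses
; psrlw $8 / psllw $8 / por per register, SSSE3+ a [1,0,3,2,...] pshufb mask.
; AVX512F (without BW) has no 512-bit vpshufb, so it operates on 256-bit
; halves and recombines with vinserti64x4; AVX512BW stays in a single zmm.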
1758define <32 x i16> @test_bitreverse_v32i16(<32 x i16> %a) nounwind {
1759; SSE2-LABEL: test_bitreverse_v32i16:
1760; SSE2:       # %bb.0:
1761; SSE2-NEXT:    movdqa %xmm0, %xmm4
1762; SSE2-NEXT:    psrlw $8, %xmm4
1763; SSE2-NEXT:    psllw $8, %xmm0
1764; SSE2-NEXT:    por %xmm4, %xmm0
1765; SSE2-NEXT:    movdqa %xmm0, %xmm5
1766; SSE2-NEXT:    psrlw $4, %xmm5
1767; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1768; SSE2-NEXT:    pand %xmm4, %xmm5
1769; SSE2-NEXT:    pand %xmm4, %xmm0
1770; SSE2-NEXT:    psllw $4, %xmm0
1771; SSE2-NEXT:    por %xmm5, %xmm0
1772; SSE2-NEXT:    movdqa %xmm0, %xmm6
1773; SSE2-NEXT:    psrlw $2, %xmm6
1774; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
1775; SSE2-NEXT:    pand %xmm5, %xmm6
1776; SSE2-NEXT:    pand %xmm5, %xmm0
1777; SSE2-NEXT:    psllw $2, %xmm0
1778; SSE2-NEXT:    por %xmm6, %xmm0
1779; SSE2-NEXT:    movdqa %xmm0, %xmm7
1780; SSE2-NEXT:    psrlw $1, %xmm7
1781; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
1782; SSE2-NEXT:    pand %xmm6, %xmm7
1783; SSE2-NEXT:    pand %xmm6, %xmm0
1784; SSE2-NEXT:    paddb %xmm0, %xmm0
1785; SSE2-NEXT:    por %xmm7, %xmm0
1786; SSE2-NEXT:    movdqa %xmm1, %xmm7
1787; SSE2-NEXT:    psrlw $8, %xmm7
1788; SSE2-NEXT:    psllw $8, %xmm1
1789; SSE2-NEXT:    por %xmm7, %xmm1
1790; SSE2-NEXT:    movdqa %xmm1, %xmm7
1791; SSE2-NEXT:    psrlw $4, %xmm7
1792; SSE2-NEXT:    pand %xmm4, %xmm7
1793; SSE2-NEXT:    pand %xmm4, %xmm1
1794; SSE2-NEXT:    psllw $4, %xmm1
1795; SSE2-NEXT:    por %xmm7, %xmm1
1796; SSE2-NEXT:    movdqa %xmm1, %xmm7
1797; SSE2-NEXT:    psrlw $2, %xmm7
1798; SSE2-NEXT:    pand %xmm5, %xmm7
1799; SSE2-NEXT:    pand %xmm5, %xmm1
1800; SSE2-NEXT:    psllw $2, %xmm1
1801; SSE2-NEXT:    por %xmm7, %xmm1
1802; SSE2-NEXT:    movdqa %xmm1, %xmm7
1803; SSE2-NEXT:    psrlw $1, %xmm7
1804; SSE2-NEXT:    pand %xmm6, %xmm7
1805; SSE2-NEXT:    pand %xmm6, %xmm1
1806; SSE2-NEXT:    paddb %xmm1, %xmm1
1807; SSE2-NEXT:    por %xmm7, %xmm1
1808; SSE2-NEXT:    movdqa %xmm2, %xmm7
1809; SSE2-NEXT:    psrlw $8, %xmm7
1810; SSE2-NEXT:    psllw $8, %xmm2
1811; SSE2-NEXT:    por %xmm7, %xmm2
1812; SSE2-NEXT:    movdqa %xmm2, %xmm7
1813; SSE2-NEXT:    psrlw $4, %xmm7
1814; SSE2-NEXT:    pand %xmm4, %xmm7
1815; SSE2-NEXT:    pand %xmm4, %xmm2
1816; SSE2-NEXT:    psllw $4, %xmm2
1817; SSE2-NEXT:    por %xmm7, %xmm2
1818; SSE2-NEXT:    movdqa %xmm2, %xmm7
1819; SSE2-NEXT:    psrlw $2, %xmm7
1820; SSE2-NEXT:    pand %xmm5, %xmm7
1821; SSE2-NEXT:    pand %xmm5, %xmm2
1822; SSE2-NEXT:    psllw $2, %xmm2
1823; SSE2-NEXT:    por %xmm7, %xmm2
1824; SSE2-NEXT:    movdqa %xmm2, %xmm7
1825; SSE2-NEXT:    psrlw $1, %xmm7
1826; SSE2-NEXT:    pand %xmm6, %xmm7
1827; SSE2-NEXT:    pand %xmm6, %xmm2
1828; SSE2-NEXT:    paddb %xmm2, %xmm2
1829; SSE2-NEXT:    por %xmm7, %xmm2
1830; SSE2-NEXT:    movdqa %xmm3, %xmm7
1831; SSE2-NEXT:    psrlw $8, %xmm7
1832; SSE2-NEXT:    psllw $8, %xmm3
1833; SSE2-NEXT:    por %xmm7, %xmm3
1834; SSE2-NEXT:    movdqa %xmm3, %xmm7
1835; SSE2-NEXT:    psrlw $4, %xmm7
1836; SSE2-NEXT:    pand %xmm4, %xmm7
1837; SSE2-NEXT:    pand %xmm4, %xmm3
1838; SSE2-NEXT:    psllw $4, %xmm3
1839; SSE2-NEXT:    por %xmm7, %xmm3
1840; SSE2-NEXT:    movdqa %xmm3, %xmm4
1841; SSE2-NEXT:    psrlw $2, %xmm4
1842; SSE2-NEXT:    pand %xmm5, %xmm4
1843; SSE2-NEXT:    pand %xmm5, %xmm3
1844; SSE2-NEXT:    psllw $2, %xmm3
1845; SSE2-NEXT:    por %xmm4, %xmm3
1846; SSE2-NEXT:    movdqa %xmm3, %xmm4
1847; SSE2-NEXT:    psrlw $1, %xmm4
1848; SSE2-NEXT:    pand %xmm6, %xmm4
1849; SSE2-NEXT:    pand %xmm6, %xmm3
1850; SSE2-NEXT:    paddb %xmm3, %xmm3
1851; SSE2-NEXT:    por %xmm4, %xmm3
1852; SSE2-NEXT:    retq
1853;
1854; SSSE3-LABEL: test_bitreverse_v32i16:
1855; SSSE3:       # %bb.0:
1856; SSSE3-NEXT:    movdqa %xmm1, %xmm5
1857; SSSE3-NEXT:    movdqa %xmm0, %xmm1
1858; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1859; SSSE3-NEXT:    pshufb %xmm8, %xmm1
1860; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1861; SSSE3-NEXT:    movdqa %xmm1, %xmm0
1862; SSSE3-NEXT:    pand %xmm7, %xmm0
1863; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1864; SSSE3-NEXT:    movdqa %xmm6, %xmm9
1865; SSSE3-NEXT:    pshufb %xmm0, %xmm9
1866; SSSE3-NEXT:    psrlw $4, %xmm1
1867; SSSE3-NEXT:    pand %xmm7, %xmm1
1868; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1869; SSSE3-NEXT:    movdqa %xmm4, %xmm0
1870; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1871; SSSE3-NEXT:    por %xmm9, %xmm0
1872; SSSE3-NEXT:    pshufb %xmm8, %xmm5
1873; SSSE3-NEXT:    movdqa %xmm5, %xmm1
1874; SSSE3-NEXT:    pand %xmm7, %xmm1
1875; SSSE3-NEXT:    movdqa %xmm6, %xmm9
1876; SSSE3-NEXT:    pshufb %xmm1, %xmm9
1877; SSSE3-NEXT:    psrlw $4, %xmm5
1878; SSSE3-NEXT:    pand %xmm7, %xmm5
1879; SSSE3-NEXT:    movdqa %xmm4, %xmm1
1880; SSSE3-NEXT:    pshufb %xmm5, %xmm1
1881; SSSE3-NEXT:    por %xmm9, %xmm1
1882; SSSE3-NEXT:    pshufb %xmm8, %xmm2
1883; SSSE3-NEXT:    movdqa %xmm2, %xmm5
1884; SSSE3-NEXT:    pand %xmm7, %xmm5
1885; SSSE3-NEXT:    movdqa %xmm6, %xmm9
1886; SSSE3-NEXT:    pshufb %xmm5, %xmm9
1887; SSSE3-NEXT:    psrlw $4, %xmm2
1888; SSSE3-NEXT:    pand %xmm7, %xmm2
1889; SSSE3-NEXT:    movdqa %xmm4, %xmm5
1890; SSSE3-NEXT:    pshufb %xmm2, %xmm5
1891; SSSE3-NEXT:    por %xmm9, %xmm5
1892; SSSE3-NEXT:    pshufb %xmm8, %xmm3
1893; SSSE3-NEXT:    movdqa %xmm3, %xmm2
1894; SSSE3-NEXT:    pand %xmm7, %xmm2
1895; SSSE3-NEXT:    pshufb %xmm2, %xmm6
1896; SSSE3-NEXT:    psrlw $4, %xmm3
1897; SSSE3-NEXT:    pand %xmm7, %xmm3
1898; SSSE3-NEXT:    pshufb %xmm3, %xmm4
1899; SSSE3-NEXT:    por %xmm6, %xmm4
1900; SSSE3-NEXT:    movdqa %xmm5, %xmm2
1901; SSSE3-NEXT:    movdqa %xmm4, %xmm3
1902; SSSE3-NEXT:    retq
1903;
1904; AVX1-LABEL: test_bitreverse_v32i16:
1905; AVX1:       # %bb.0:
1906; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1907; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1908; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1909; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1910; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1911; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1912; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1913; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1914; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1915; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1916; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1917; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1918; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
1919; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
1920; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1921; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
1922; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1923; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
1924; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
1925; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1926; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
1927; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
1928; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
1929; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
1930; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
1931; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
1932; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
1933; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
1934; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
1935; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
1936; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
1937; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
1938; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
1939; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
1940; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
1941; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
1942; AVX1-NEXT:    retq
1943;
1944; AVX2-LABEL: test_bitreverse_v32i16:
1945; AVX2:       # %bb.0:
1946; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1947; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
1948; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1949; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1950; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
1951; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1952; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
1953; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1954; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
1955; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
1956; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1957; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
1958; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
1959; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
1960; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1961; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
1962; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1963; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
1964; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
1965; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
1966; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
1967; AVX2-NEXT:    retq
1968;
1969; AVX512F-LABEL: test_bitreverse_v32i16:
1970; AVX512F:       # %bb.0:
1971; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
1972; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
1973; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
1974; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
1975; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
1976; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
1977; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
1978; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
1979; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
1980; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
1981; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
1982; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
1983; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
1984; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
1985; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
1986; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
1987; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
1988; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
1989; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
1990; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
1991; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
1992; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
1993; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
1994; AVX512F-NEXT:    retq
1995;
1996; AVX512BW-LABEL: test_bitreverse_v32i16:
1997; AVX512BW:       # %bb.0:
1998; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
1999; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2000; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
2001; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2002; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2003; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
2004; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
2005; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
2006; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2007; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2008; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
2009; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
2010; AVX512BW-NEXT:    retq
2011;
2012; XOPAVX1-LABEL: test_bitreverse_v32i16:
2013; XOPAVX1:       # %bb.0:
2014; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2015; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2016; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2017; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2018; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2019; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
2020; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2021; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2022; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
2023; XOPAVX1-NEXT:    retq
2024;
2025; XOPAVX2-LABEL: test_bitreverse_v32i16:
2026; XOPAVX2:       # %bb.0:
2027; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2028; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [81,80,83,82,85,84,87,86,89,88,91,90,93,92,95,94]
2029; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2030; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
2031; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
2032; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2033; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
2034; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
2035; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
2036; XOPAVX2-NEXT:    retq
2037;
2038; GFNISSE-LABEL: test_bitreverse_v32i16:
2039; GFNISSE:       # %bb.0:
2040; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2041; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
2042; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2043; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
2044; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
2045; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
2046; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
2047; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
2048; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
2049; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
2050; GFNISSE-NEXT:    retq
2051;
2052; GFNIAVX1-LABEL: test_bitreverse_v32i16:
2053; GFNIAVX1:       # %bb.0:
2054; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2055; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2056; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2057; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
2058; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
2059; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2060; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
2061; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
2062; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
2063; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
2064; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
2065; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
2066; GFNIAVX1-NEXT:    retq
2067;
2068; GFNIAVX2-LABEL: test_bitreverse_v32i16:
2069; GFNIAVX2:       # %bb.0:
2070; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2071; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
2072; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2073; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
2074; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
2075; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2076; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
2077; GFNIAVX2-NEXT:    retq
2078;
2079; GFNIAVX512F-LABEL: test_bitreverse_v32i16:
2080; GFNIAVX512F:       # %bb.0:
2081; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
2082; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
2083; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
2084; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
2085; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
2086; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
2087; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2088; GFNIAVX512F-NEXT:    retq
2089;
2090; GFNIAVX512BW-LABEL: test_bitreverse_v32i16:
2091; GFNIAVX512BW:       # %bb.0:
2092; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14,17,16,19,18,21,20,23,22,25,24,27,26,29,28,31,30,33,32,35,34,37,36,39,38,41,40,43,42,45,44,47,46,49,48,51,50,53,52,55,54,57,56,59,58,61,60,63,62]
2093; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
2094; GFNIAVX512BW-NEXT:    retq
2095  %b = call <32 x i16> @llvm.bitreverse.v32i16(<32 x i16> %a)
2096  ret <32 x i16> %b
2097}
2098
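; Note: v16i32 is the same pattern at dword granularity, with the
; [3,2,1,0,7,6,5,4,...] byte-swap mask. Lacking pshufb, SSE2 synthesizes the
; swap by widening to words (punpcklbw/punpckhbw with zero), reversing with
; pshuflw/pshufhw, and repacking with packuswb.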
2099define <16 x i32> @test_bitreverse_v16i32(<16 x i32> %a) nounwind {
2100; SSE2-LABEL: test_bitreverse_v16i32:
2101; SSE2:       # %bb.0:
2102; SSE2-NEXT:    pxor %xmm4, %xmm4
2103; SSE2-NEXT:    movdqa %xmm0, %xmm5
2104; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
2105; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
2106; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
2107; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
2108; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
2109; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
2110; SSE2-NEXT:    packuswb %xmm5, %xmm0
2111; SSE2-NEXT:    movdqa %xmm0, %xmm6
2112; SSE2-NEXT:    psrlw $4, %xmm6
2113; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2114; SSE2-NEXT:    pand %xmm5, %xmm6
2115; SSE2-NEXT:    pand %xmm5, %xmm0
2116; SSE2-NEXT:    psllw $4, %xmm0
2117; SSE2-NEXT:    por %xmm6, %xmm0
2118; SSE2-NEXT:    movdqa %xmm0, %xmm7
2119; SSE2-NEXT:    psrlw $2, %xmm7
2120; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
2121; SSE2-NEXT:    pand %xmm6, %xmm7
2122; SSE2-NEXT:    pand %xmm6, %xmm0
2123; SSE2-NEXT:    psllw $2, %xmm0
2124; SSE2-NEXT:    por %xmm7, %xmm0
2125; SSE2-NEXT:    movdqa %xmm0, %xmm8
2126; SSE2-NEXT:    psrlw $1, %xmm8
2127; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
2128; SSE2-NEXT:    pand %xmm7, %xmm8
2129; SSE2-NEXT:    pand %xmm7, %xmm0
2130; SSE2-NEXT:    paddb %xmm0, %xmm0
2131; SSE2-NEXT:    por %xmm8, %xmm0
2132; SSE2-NEXT:    movdqa %xmm1, %xmm8
2133; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2134; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2135; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2136; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
2137; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
2138; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
2139; SSE2-NEXT:    packuswb %xmm8, %xmm1
2140; SSE2-NEXT:    movdqa %xmm1, %xmm8
2141; SSE2-NEXT:    psrlw $4, %xmm8
2142; SSE2-NEXT:    pand %xmm5, %xmm8
2143; SSE2-NEXT:    pand %xmm5, %xmm1
2144; SSE2-NEXT:    psllw $4, %xmm1
2145; SSE2-NEXT:    por %xmm8, %xmm1
2146; SSE2-NEXT:    movdqa %xmm1, %xmm8
2147; SSE2-NEXT:    psrlw $2, %xmm8
2148; SSE2-NEXT:    pand %xmm6, %xmm8
2149; SSE2-NEXT:    pand %xmm6, %xmm1
2150; SSE2-NEXT:    psllw $2, %xmm1
2151; SSE2-NEXT:    por %xmm8, %xmm1
2152; SSE2-NEXT:    movdqa %xmm1, %xmm8
2153; SSE2-NEXT:    psrlw $1, %xmm8
2154; SSE2-NEXT:    pand %xmm7, %xmm8
2155; SSE2-NEXT:    pand %xmm7, %xmm1
2156; SSE2-NEXT:    paddb %xmm1, %xmm1
2157; SSE2-NEXT:    por %xmm8, %xmm1
2158; SSE2-NEXT:    movdqa %xmm2, %xmm8
2159; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2160; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2161; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2162; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
2163; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
2164; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
2165; SSE2-NEXT:    packuswb %xmm8, %xmm2
2166; SSE2-NEXT:    movdqa %xmm2, %xmm8
2167; SSE2-NEXT:    psrlw $4, %xmm8
2168; SSE2-NEXT:    pand %xmm5, %xmm8
2169; SSE2-NEXT:    pand %xmm5, %xmm2
2170; SSE2-NEXT:    psllw $4, %xmm2
2171; SSE2-NEXT:    por %xmm8, %xmm2
2172; SSE2-NEXT:    movdqa %xmm2, %xmm8
2173; SSE2-NEXT:    psrlw $2, %xmm8
2174; SSE2-NEXT:    pand %xmm6, %xmm8
2175; SSE2-NEXT:    pand %xmm6, %xmm2
2176; SSE2-NEXT:    psllw $2, %xmm2
2177; SSE2-NEXT:    por %xmm8, %xmm2
2178; SSE2-NEXT:    movdqa %xmm2, %xmm8
2179; SSE2-NEXT:    psrlw $1, %xmm8
2180; SSE2-NEXT:    pand %xmm7, %xmm8
2181; SSE2-NEXT:    pand %xmm7, %xmm2
2182; SSE2-NEXT:    paddb %xmm2, %xmm2
2183; SSE2-NEXT:    por %xmm8, %xmm2
2184; SSE2-NEXT:    movdqa %xmm3, %xmm8
2185; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
2186; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
2187; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
2188; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
2189; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
2190; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
2191; SSE2-NEXT:    packuswb %xmm8, %xmm3
2192; SSE2-NEXT:    movdqa %xmm3, %xmm4
2193; SSE2-NEXT:    psrlw $4, %xmm4
2194; SSE2-NEXT:    pand %xmm5, %xmm4
2195; SSE2-NEXT:    pand %xmm5, %xmm3
2196; SSE2-NEXT:    psllw $4, %xmm3
2197; SSE2-NEXT:    por %xmm4, %xmm3
2198; SSE2-NEXT:    movdqa %xmm3, %xmm4
2199; SSE2-NEXT:    psrlw $2, %xmm4
2200; SSE2-NEXT:    pand %xmm6, %xmm4
2201; SSE2-NEXT:    pand %xmm6, %xmm3
2202; SSE2-NEXT:    psllw $2, %xmm3
2203; SSE2-NEXT:    por %xmm4, %xmm3
2204; SSE2-NEXT:    movdqa %xmm3, %xmm4
2205; SSE2-NEXT:    psrlw $1, %xmm4
2206; SSE2-NEXT:    pand %xmm7, %xmm4
2207; SSE2-NEXT:    pand %xmm7, %xmm3
2208; SSE2-NEXT:    paddb %xmm3, %xmm3
2209; SSE2-NEXT:    por %xmm4, %xmm3
2210; SSE2-NEXT:    retq
2211;
2212; SSSE3-LABEL: test_bitreverse_v16i32:
2213; SSSE3:       # %bb.0:
2214; SSSE3-NEXT:    movdqa %xmm1, %xmm5
2215; SSSE3-NEXT:    movdqa %xmm0, %xmm1
2216; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2217; SSSE3-NEXT:    pshufb %xmm8, %xmm1
2218; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2219; SSSE3-NEXT:    movdqa %xmm1, %xmm0
2220; SSSE3-NEXT:    pand %xmm7, %xmm0
2221; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2222; SSSE3-NEXT:    movdqa %xmm6, %xmm9
2223; SSSE3-NEXT:    pshufb %xmm0, %xmm9
2224; SSSE3-NEXT:    psrlw $4, %xmm1
2225; SSSE3-NEXT:    pand %xmm7, %xmm1
2226; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
2227; SSSE3-NEXT:    movdqa %xmm4, %xmm0
2228; SSSE3-NEXT:    pshufb %xmm1, %xmm0
2229; SSSE3-NEXT:    por %xmm9, %xmm0
2230; SSSE3-NEXT:    pshufb %xmm8, %xmm5
2231; SSSE3-NEXT:    movdqa %xmm5, %xmm1
2232; SSSE3-NEXT:    pand %xmm7, %xmm1
2233; SSSE3-NEXT:    movdqa %xmm6, %xmm9
2234; SSSE3-NEXT:    pshufb %xmm1, %xmm9
2235; SSSE3-NEXT:    psrlw $4, %xmm5
2236; SSSE3-NEXT:    pand %xmm7, %xmm5
2237; SSSE3-NEXT:    movdqa %xmm4, %xmm1
2238; SSSE3-NEXT:    pshufb %xmm5, %xmm1
2239; SSSE3-NEXT:    por %xmm9, %xmm1
2240; SSSE3-NEXT:    pshufb %xmm8, %xmm2
2241; SSSE3-NEXT:    movdqa %xmm2, %xmm5
2242; SSSE3-NEXT:    pand %xmm7, %xmm5
2243; SSSE3-NEXT:    movdqa %xmm6, %xmm9
2244; SSSE3-NEXT:    pshufb %xmm5, %xmm9
2245; SSSE3-NEXT:    psrlw $4, %xmm2
2246; SSSE3-NEXT:    pand %xmm7, %xmm2
2247; SSSE3-NEXT:    movdqa %xmm4, %xmm5
2248; SSSE3-NEXT:    pshufb %xmm2, %xmm5
2249; SSSE3-NEXT:    por %xmm9, %xmm5
2250; SSSE3-NEXT:    pshufb %xmm8, %xmm3
2251; SSSE3-NEXT:    movdqa %xmm3, %xmm2
2252; SSSE3-NEXT:    pand %xmm7, %xmm2
2253; SSSE3-NEXT:    pshufb %xmm2, %xmm6
2254; SSSE3-NEXT:    psrlw $4, %xmm3
2255; SSSE3-NEXT:    pand %xmm7, %xmm3
2256; SSSE3-NEXT:    pshufb %xmm3, %xmm4
2257; SSSE3-NEXT:    por %xmm6, %xmm4
2258; SSSE3-NEXT:    movdqa %xmm5, %xmm2
2259; SSSE3-NEXT:    movdqa %xmm4, %xmm3
2260; SSSE3-NEXT:    retq
2261;
2262; AVX1-LABEL: test_bitreverse_v16i32:
2263; AVX1:       # %bb.0:
2264; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
2265; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
2266; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
2267; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
2268; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
2269; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
2270; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
2271; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v16i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v16i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v16i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [83,82,81,80,87,86,85,84,91,90,89,88,95,94,93,92]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v16i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v16i32:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v16i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v16i32:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v16i32:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12,19,18,17,16,23,22,21,20,27,26,25,24,31,30,29,28,35,34,33,32,39,38,37,36,43,42,41,40,47,46,45,44,51,50,49,48,55,54,53,52,59,58,57,56,63,62,61,60]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> %a)
  ret <16 x i32> %b
}

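; The vector lowerings below share one structure: byte-swap each element
; (a [7,6,5,4,3,2,1,0,...] pshufb, or punpck+pshuflw/pshufhw/pshufd on
; plain SSE2), then bit-reverse every byte. SSE2 does the per-byte step
; with three shift/and/or passes (masks 15 = 0x0F, 51 = 0x33, 85 = 0x55,
; swapping nibbles, bit-pairs and single bits); SSSE3/AVX use two pshufb
; table lookups instead:
;   rev8(x) = HiLUT[x & 15] | LoLUT[x >> 4]
; where HiLUT = [0,128,64,192,...] holds each reversed nibble in the high
; half of the byte and LoLUT = [0,8,4,12,...] holds it in the low half.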
define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind {
; SSE2-LABEL: test_bitreverse_v8i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm5, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm6
; SSE2-NEXT:    psrlw $4, %xmm6
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm0
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    por %xmm6, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm7
; SSE2-NEXT:    psrlw $2, %xmm7
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; SSE2-NEXT:    pand %xmm6, %xmm7
; SSE2-NEXT:    pand %xmm6, %xmm0
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    por %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm8
; SSE2-NEXT:    psrlw $1, %xmm8
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; SSE2-NEXT:    pand %xmm7, %xmm8
; SSE2-NEXT:    pand %xmm7, %xmm0
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    por %xmm8, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm8, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm8
; SSE2-NEXT:    psrlw $4, %xmm8
; SSE2-NEXT:    pand %xmm5, %xmm8
; SSE2-NEXT:    pand %xmm5, %xmm1
; SSE2-NEXT:    psllw $4, %xmm1
; SSE2-NEXT:    por %xmm8, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm8
; SSE2-NEXT:    psrlw $2, %xmm8
; SSE2-NEXT:    pand %xmm6, %xmm8
; SSE2-NEXT:    pand %xmm6, %xmm1
; SSE2-NEXT:    psllw $2, %xmm1
; SSE2-NEXT:    por %xmm8, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm8
; SSE2-NEXT:    psrlw $1, %xmm8
; SSE2-NEXT:    pand %xmm7, %xmm8
; SSE2-NEXT:    pand %xmm7, %xmm1
; SSE2-NEXT:    paddb %xmm1, %xmm1
; SSE2-NEXT:    por %xmm8, %xmm1
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    psrlw $4, %xmm8
; SSE2-NEXT:    pand %xmm5, %xmm8
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    psllw $4, %xmm2
; SSE2-NEXT:    por %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    psrlw $2, %xmm8
; SSE2-NEXT:    pand %xmm6, %xmm8
; SSE2-NEXT:    pand %xmm6, %xmm2
; SSE2-NEXT:    psllw $2, %xmm2
; SSE2-NEXT:    por %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm8
; SSE2-NEXT:    psrlw $1, %xmm8
; SSE2-NEXT:    pand %xmm7, %xmm8
; SSE2-NEXT:    pand %xmm7, %xmm2
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    por %xmm8, %xmm2
; SSE2-NEXT:    movdqa %xmm3, %xmm8
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm8[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm8 = xmm8[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[2,3,0,1]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[3,2,1,0,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4]
; SSE2-NEXT:    packuswb %xmm8, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psrlw $4, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm4
; SSE2-NEXT:    pand %xmm5, %xmm3
; SSE2-NEXT:    psllw $4, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psrlw $2, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm4
; SSE2-NEXT:    pand %xmm6, %xmm3
; SSE2-NEXT:    psllw $2, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psrlw $1, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm4
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    paddb %xmm3, %xmm3
; SSE2-NEXT:    por %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: test_bitreverse_v8i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa %xmm1, %xmm5
; SSSE3-NEXT:    movdqa %xmm0, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm8 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; SSSE3-NEXT:    pshufb %xmm8, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; SSSE3-NEXT:    movdqa %xmm1, %xmm0
; SSSE3-NEXT:    pand %xmm7, %xmm0
; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; SSSE3-NEXT:    movdqa %xmm6, %xmm9
; SSSE3-NEXT:    pshufb %xmm0, %xmm9
; SSSE3-NEXT:    psrlw $4, %xmm1
; SSSE3-NEXT:    pand %xmm7, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; SSSE3-NEXT:    movdqa %xmm4, %xmm0
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    por %xmm9, %xmm0
; SSSE3-NEXT:    pshufb %xmm8, %xmm5
; SSSE3-NEXT:    movdqa %xmm5, %xmm1
; SSSE3-NEXT:    pand %xmm7, %xmm1
; SSSE3-NEXT:    movdqa %xmm6, %xmm9
; SSSE3-NEXT:    pshufb %xmm1, %xmm9
; SSSE3-NEXT:    psrlw $4, %xmm5
; SSSE3-NEXT:    pand %xmm7, %xmm5
; SSSE3-NEXT:    movdqa %xmm4, %xmm1
; SSSE3-NEXT:    pshufb %xmm5, %xmm1
; SSSE3-NEXT:    por %xmm9, %xmm1
; SSSE3-NEXT:    pshufb %xmm8, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm5
; SSSE3-NEXT:    pand %xmm7, %xmm5
; SSSE3-NEXT:    movdqa %xmm6, %xmm9
; SSSE3-NEXT:    pshufb %xmm5, %xmm9
; SSSE3-NEXT:    psrlw $4, %xmm2
; SSSE3-NEXT:    pand %xmm7, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm5
; SSSE3-NEXT:    pshufb %xmm2, %xmm5
; SSSE3-NEXT:    por %xmm9, %xmm5
; SSSE3-NEXT:    pshufb %xmm8, %xmm3
; SSSE3-NEXT:    movdqa %xmm3, %xmm2
; SSSE3-NEXT:    pand %xmm7, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm6
; SSSE3-NEXT:    psrlw $4, %xmm3
; SSSE3-NEXT:    pand %xmm7, %xmm3
; SSSE3-NEXT:    pshufb %xmm3, %xmm4
; SSSE3-NEXT:    por %xmm6, %xmm4
; SSSE3-NEXT:    movdqa %xmm5, %xmm2
; SSSE3-NEXT:    movdqa %xmm4, %xmm3
; SSSE3-NEXT:    retq
;
; AVX1-LABEL: test_bitreverse_v8i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm7 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm0, %xmm7, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm5, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm5
; AVX1-NEXT:    vpshufb %xmm5, %xmm6, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpshufb %xmm2, %xmm7, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm3
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm7, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT:    retq
;
; AVX2-LABEL: test_bitreverse_v8i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm4
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX2-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX2-NEXT:    # ymm6 = mem[0,1,0,1]
; AVX2-NEXT:    vpshufb %ymm0, %ymm6, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm4, %ymm0
; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm2
; AVX2-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX2-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpshufb %ymm1, %ymm6, %ymm1
; AVX2-NEXT:    vpor %ymm1, %ymm2, %ymm1
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: test_bitreverse_v8i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; AVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm4
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512F-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm4, %ymm5, %ymm4
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm2
; AVX512F-NEXT:    vpshufb %ymm2, %ymm5, %ymm2
; AVX512F-NEXT:    vinserti64x4 $1, %ymm4, %zmm2, %zmm2
; AVX512F-NEXT:    vpsrlw $4, %ymm1, %ymm1
; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512F-NEXT:    vpshufb %ymm1, %ymm4, %ymm1
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512F-NEXT:    vpand %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: test_bitreverse_v8i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; AVX512BW-NEXT:    vpbroadcastb {{.*#+}} zmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm3 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240,0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
; AVX512BW-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpshufb %zmm2, %zmm3, %zmm2
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15,0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
; AVX512BW-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT:    vpshufb %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
; AVX512BW-NEXT:    retq
;
; XOPAVX1-LABEL: test_bitreverse_v8i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX1-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: test_bitreverse_v8i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm2, %xmm0, %xmm2
; XOPAVX2-NEXT:    vpperm %xmm3, %xmm1, %xmm0, %xmm1
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; XOPAVX2-NEXT:    retq
;
; GFNISSE-LABEL: test_bitreverse_v8i64:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm4 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNISSE-NEXT:    pshufb %xmm4, %xmm0
; GFNISSE-NEXT:    movdqa {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm0
; GFNISSE-NEXT:    pshufb %xmm4, %xmm1
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm1
; GFNISSE-NEXT:    pshufb %xmm4, %xmm2
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm2
; GFNISSE-NEXT:    pshufb %xmm4, %xmm3
; GFNISSE-NEXT:    gf2p8affineqb $0, %xmm5, %xmm3
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: test_bitreverse_v8i64:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; GFNIAVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm0, %ymm0
; GFNIAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm4
; GFNIAVX1-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
; GFNIAVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; GFNIAVX1-NEXT:    vgf2p8affineqb $0, %ymm2, %ymm1, %ymm1
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: test_bitreverse_v8i64:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpbroadcastq {{.*#+}} ymm3 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm0, %ymm0
; GFNIAVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX2-NEXT:    vgf2p8affineqb $0, %ymm3, %ymm1, %ymm1
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512F-LABEL: test_bitreverse_v8i64:
; GFNIAVX512F:       # %bb.0:
; GFNIAVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; GFNIAVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8]
; GFNIAVX512F-NEXT:    # ymm2 = mem[0,1,0,1]
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; GFNIAVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; GFNIAVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512F-NEXT:    retq
;
; GFNIAVX512BW-LABEL: test_bitreverse_v8i64:
; GFNIAVX512BW:       # %bb.0:
; GFNIAVX512BW-NEXT:    vpshufb {{.*#+}} zmm0 = zmm0[7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,23,22,21,20,19,18,17,16,31,30,29,28,27,26,25,24,39,38,37,36,35,34,33,32,47,46,45,44,43,42,41,40,55,54,53,52,51,50,49,48,63,62,61,60,59,58,57,56]
; GFNIAVX512BW-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
; GFNIAVX512BW-NEXT:    retq
  %b = call <8 x i64> @llvm.bitreverse.v8i64(<8 x i64> %a)
  ret <8 x i64> %b
}
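
; The XOP and GFNI runs above need no lookup tables for the bit step:
; vpperm selector bytes in the 0x50-0x5F range (the 80-95 constants) return
; the selected source byte with its bits reversed, so a single vpperm does
; the byte swap and the per-byte reverse together, and gf2p8affineqb against
; the broadcast matrix [1,2,4,8,16,32,64,128] (qword 0x8040201008040201,
; the GF(2) bit-reversal matrix) reverses the bits of every byte in one
; instruction.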

;
; Constant Folding
;

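; Each call below takes only constant operands, so the bitreverse is
; evaluated at compile time and codegen reduces to materializing the
; precomputed result: an immediate for i32, constant-pool loads for the
; vector cases.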
define i32 @fold_bitreverse_i32() nounwind {
; ALL-LABEL: fold_bitreverse_i32:
; ALL:       # %bb.0:
; ALL-NEXT:    movl $16711935, %eax # imm = 0xFF00FF
; ALL-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 4278255360)
  ret i32 %b
}
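; (4278255360 is 0xFF00FF00; reversing its 32 bits gives 0x00FF00FF,
; i.e. the 16711935 immediate checked above.)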

define <16 x i8> @fold_bitreverse_v16i8() nounwind {
; SSE-LABEL: fold_bitreverse_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i8:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i8:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} xmm0 = [0,255,64,191,32,223,96,159,16,239,80,175,48,207,112,143]
; GFNIAVX-NEXT:    retq
  %b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> <i8 0, i8 -1, i8 2, i8 -3, i8 4, i8 -5, i8 6, i8 -7, i8 8, i8 -9, i8 10, i8 -11, i8 12, i8 -13, i8 14, i8 -15>)
  ret <16 x i8> %b
}

define <16 x i16> @fold_bitreverse_v16i16() nounwind {
; SSE-LABEL: fold_bitreverse_v16i16:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; SSE-NEXT:    retq
;
; AVX-LABEL: fold_bitreverse_v16i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; AVX-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i16:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,65535,16384,49151,8192,57343,24576,40959]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [4096,61439,20480,45055,12288,53247,28672,36863]
; GFNISSE-NEXT:    retq
;
; GFNIAVX-LABEL: fold_bitreverse_v16i16:
; GFNIAVX:       # %bb.0:
; GFNIAVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,65535,16384,49151,8192,57343,24576,40959,4096,61439,20480,45055,12288,53247,28672,36863]
; GFNIAVX-NEXT:    retq
  %b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> <i16 0, i16 -1, i16 2, i16 -3, i16 4, i16 -5, i16 6, i16 -7, i16 8, i16 -9, i16 10, i16 -11, i16 12, i16 -13, i16 14, i16 -15>)
  ret <16 x i16> %b
}

define <16 x i32> @fold_bitreverse_v16i32() nounwind {
; SSE-LABEL: fold_bitreverse_v16i32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; SSE-NEXT:    retq
;
; AVX1-LABEL: fold_bitreverse_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: fold_bitreverse_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: fold_bitreverse_v16i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; AVX512-NEXT:    retq
;
; XOP-LABEL: fold_bitreverse_v16i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; XOP-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; XOP-NEXT:    retq
;
; GFNISSE-LABEL: fold_bitreverse_v16i32:
; GFNISSE:       # %bb.0:
; GFNISSE-NEXT:    movaps {{.*#+}} xmm0 = [0,4294967295,1073741824,3221225471]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm1 = [536870912,3758096383,1610612736,2684354559]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm2 = [268435456,4026531839,1342177280,2952790015]
; GFNISSE-NEXT:    movaps {{.*#+}} xmm3 = [805306368,3489660927,1879048192,2415919103]
; GFNISSE-NEXT:    retq
;
; GFNIAVX1-LABEL: fold_bitreverse_v16i32:
; GFNIAVX1:       # %bb.0:
; GFNIAVX1-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX1-NEXT:    retq
;
; GFNIAVX2-LABEL: fold_bitreverse_v16i32:
; GFNIAVX2:       # %bb.0:
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559]
; GFNIAVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX2-NEXT:    retq
;
; GFNIAVX512-LABEL: fold_bitreverse_v16i32:
; GFNIAVX512:       # %bb.0:
; GFNIAVX512-NEXT:    vmovaps {{.*#+}} zmm0 = [0,4294967295,1073741824,3221225471,536870912,3758096383,1610612736,2684354559,268435456,4026531839,1342177280,2952790015,805306368,3489660927,1879048192,2415919103]
; GFNIAVX512-NEXT:    retq
  %b = call <16 x i32> @llvm.bitreverse.v16i32(<16 x i32> <i32 0, i32 -1, i32 2, i32 -3, i32 4, i32 -5, i32 6, i32 -7, i32 8, i32 -9, i32 10, i32 -11, i32 12, i32 -13, i32 14, i32 -15>)
  ret <16 x i32> %b
}

declare i8 @llvm.bitreverse.i8(i8) readnone
declare i16 @llvm.bitreverse.i16(i16) readnone
declare i32 @llvm.bitreverse.i32(i32) readnone
declare i64 @llvm.bitreverse.i64(i64) readnone

declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>) readnone
declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>) readnone
declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) readnone
declare <2 x i64> @llvm.bitreverse.v2i64(<2 x i64>) readnone

declare <32 x i8>  @llvm.bitreverse.v32i8(<32 x i8>) readnone
declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>) readnone
declare <8 x i32>  @llvm.bitreverse.v8i32(<8 x i32>) readnone
declare <4 x i64>  @llvm.bitreverse.v4i64(<4 x i64>) readnone

declare <64 x i8>  @llvm.bitreverse.v64i8(<64 x i8>) readnone
declare <32 x i16> @llvm.bitreverse.v32i16(<32 x i16>) readnone
declare <16 x i32> @llvm.bitreverse.v16i32(<16 x i32>) readnone
declare <8 x i64>  @llvm.bitreverse.v8i64(<8 x i64>) readnone
