; xref: /llvm-project/llvm/test/CodeGen/X86/bitreverse.ll (revision b34d64921b2f878b6e1ac7205fc4b13d54a7d8db)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-unknown -mattr=+xop | FileCheck %s --check-prefixes=CHECK,X86XOP
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=CHECK,GFNI,X86GFNI
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl,+gfni | FileCheck %s --check-prefixes=CHECK,GFNI,X64GFNI

; These tests just check that the plumbing is in place for @llvm.bitreverse. The
; actual output is massive at the moment as llvm.bitreverse is not yet legal.

declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone

; <2 x i16> bitreverse: plain X86 scalarizes into two bit-swap chains (results in
; ax/dx); XOP lowers to a single vpperm; GFNI uses vpshufb + vgf2p8affineqb.
define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
; X86-LABEL: test_bitreverse_v2i16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    rolw $8, %ax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
; X86-NEXT:    shll $4, %edx
; X86-NEXT:    shrl $4, %eax
; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
; X86-NEXT:    orl %edx, %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $13107, %edx # imm = 0x3333
; X86-NEXT:    shrl $2, %eax
; X86-NEXT:    andl $13107, %eax # imm = 0x3333
; X86-NEXT:    leal (%eax,%edx,4), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $21845, %edx # imm = 0x5555
; X86-NEXT:    shrl %eax
; X86-NEXT:    andl $21845, %eax # imm = 0x5555
; X86-NEXT:    leal (%eax,%edx,2), %eax
; X86-NEXT:    rolw $8, %cx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
; X86-NEXT:    shll $4, %edx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
; X86-NEXT:    orl %edx, %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $13107, %edx # imm = 0x3333
; X86-NEXT:    shrl $2, %ecx
; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
; X86-NEXT:    leal (%ecx,%edx,4), %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $21845, %edx # imm = 0x5555
; X86-NEXT:    shrl %ecx
; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
; X86-NEXT:    leal (%ecx,%edx,2), %edx
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    # kill: def $dx killed $dx killed $edx
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_v2i16:
; X64:       # %bb.0:
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlw $8, %xmm1
; X64-NEXT:    psllw $8, %xmm0
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlw $4, %xmm1
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    pand %xmm2, %xmm0
; X64-NEXT:    psllw $4, %xmm0
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlw $2, %xmm1
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    pand %xmm2, %xmm0
; X64-NEXT:    psllw $2, %xmm0
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    psrlw $1, %xmm1
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    pand %xmm2, %xmm0
; X64-NEXT:    paddb %xmm0, %xmm0
; X64-NEXT:    por %xmm1, %xmm0
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_v2i16:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_v2i16:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_v2i16:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14]
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    retq
  %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
  ret <2 x i16> %b
}
104
declare i64 @llvm.bitreverse.i64(i64) readnone

; i64 bitreverse: i686 splits the value into two 32-bit halves (returned in
; eax/edx); x86_64 uses bswapq plus three mask-and-shift swap stages.
define i64 @test_bitreverse_i64(i64 %a) nounwind {
; X86-LABEL: test_bitreverse_i64:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    bswapl %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT:    shll $4, %edx
; X86-NEXT:    shrl $4, %eax
; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT:    orl %edx, %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
; X86-NEXT:    shrl $2, %eax
; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
; X86-NEXT:    leal (%eax,%edx,4), %eax
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
; X86-NEXT:    shrl %eax
; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT:    leal (%eax,%edx,2), %eax
; X86-NEXT:    bswapl %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
; X86-NEXT:    shll $4, %edx
; X86-NEXT:    shrl $4, %ecx
; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT:    orl %edx, %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
; X86-NEXT:    shrl $2, %ecx
; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
; X86-NEXT:    leal (%ecx,%edx,4), %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
; X86-NEXT:    shrl %ecx
; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT:    leal (%ecx,%edx,2), %edx
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i64:
; X64:       # %bb.0:
; X64-NEXT:    bswapq %rdi
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    shrq $4, %rax
; X64-NEXT:    movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F
; X64-NEXT:    andq %rcx, %rax
; X64-NEXT:    andq %rcx, %rdi
; X64-NEXT:    shlq $4, %rdi
; X64-NEXT:    orq %rax, %rdi
; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
; X64-NEXT:    movq %rdi, %rcx
; X64-NEXT:    andq %rax, %rcx
; X64-NEXT:    shrq $2, %rdi
; X64-NEXT:    andq %rax, %rdi
; X64-NEXT:    leaq (%rdi,%rcx,4), %rax
; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
; X64-NEXT:    movq %rax, %rdx
; X64-NEXT:    andq %rcx, %rdx
; X64-NEXT:    shrq %rax
; X64-NEXT:    andq %rcx, %rax
; X64-NEXT:    leaq (%rax,%rdx,2), %rax
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i64:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    vpextrd $1, %xmm0, %edx
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i64:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vpextrd $1, %xmm0, %eax
; X86GFNI-NEXT:    bswapl %eax
; X86GFNI-NEXT:    vmovd %xmm0, %edx
; X86GFNI-NEXT:    bswapl %edx
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i64:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovq %rdi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovq %xmm0, %rax
; X64GFNI-NEXT:    bswapq %rax
; X64GFNI-NEXT:    retq
  %b = call i64 @llvm.bitreverse.i64(i64 %a)
  ret i64 %b
}
200
declare i32 @llvm.bitreverse.i32(i32) readnone

; i32 bitreverse: bswap + nibble/pair/bit swap stages on plain targets;
; GFNI lowers to vgf2p8affineqb followed by bswapl on the scalar result.
define i32 @test_bitreverse_i32(i32 %a) nounwind {
; X86-LABEL: test_bitreverse_i32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    bswapl %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT:    shll $4, %ecx
; X86-NEXT:    shrl $4, %eax
; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
; X86-NEXT:    shrl $2, %eax
; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
; X86-NEXT:    leal (%eax,%ecx,4), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; X86-NEXT:    shrl %eax
; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; X86-NEXT:    leal (%eax,%ecx,2), %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i32:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    bswapl %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT:    shll $4, %eax
; X64-NEXT:    shrl $4, %edi
; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; X64-NEXT:    orl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
; X64-NEXT:    shrl $2, %edi
; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
; X64-NEXT:    leal (%rdi,%rax,4), %eax
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
; X64-NEXT:    shrl %eax
; X64-NEXT:    andl $1431655765, %eax # imm = 0x55555555
; X64-NEXT:    leal (%rax,%rcx,2), %eax
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i32:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i32:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vmovd %xmm0, %eax
; X86GFNI-NEXT:    bswapl %eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i32:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovd %edi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovd %xmm0, %eax
; X64GFNI-NEXT:    bswapl %eax
; X64GFNI-NEXT:    retq
  %b = call i32 @llvm.bitreverse.i32(i32 %a)
  ret i32 %b
}
273
declare i24 @llvm.bitreverse.i24(i24) readnone

; Non-power-of-two width: i24 is promoted to an i32 reverse and the result is
; shifted right by 8 to discard the bits that came from the padding byte.
define i24 @test_bitreverse_i24(i24 %a) nounwind {
; X86-LABEL: test_bitreverse_i24:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    bswapl %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
; X86-NEXT:    shll $4, %ecx
; X86-NEXT:    shrl $4, %eax
; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
; X86-NEXT:    shrl $2, %eax
; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
; X86-NEXT:    leal (%eax,%ecx,4), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
; X86-NEXT:    shrl %eax
; X86-NEXT:    andl $1431655680, %eax # imm = 0x55555500
; X86-NEXT:    leal (%eax,%ecx,2), %eax
; X86-NEXT:    shrl $8, %eax
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i24:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    bswapl %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
; X64-NEXT:    shll $4, %eax
; X64-NEXT:    shrl $4, %edi
; X64-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
; X64-NEXT:    orl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
; X64-NEXT:    shrl $2, %edi
; X64-NEXT:    andl $858993459, %edi # imm = 0x33333333
; X64-NEXT:    leal (%rdi,%rax,4), %eax
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
; X64-NEXT:    shrl %eax
; X64-NEXT:    andl $1431655680, %eax # imm = 0x55555500
; X64-NEXT:    leal (%rax,%rcx,2), %eax
; X64-NEXT:    shrl $8, %eax
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i24:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    shrl $8, %eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i24:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vmovd %xmm0, %eax
; X86GFNI-NEXT:    bswapl %eax
; X86GFNI-NEXT:    shrl $8, %eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i24:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovd %edi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovd %xmm0, %eax
; X64GFNI-NEXT:    bswapl %eax
; X64GFNI-NEXT:    shrl $8, %eax
; X64GFNI-NEXT:    retq
  %b = call i24 @llvm.bitreverse.i24(i24 %a)
  ret i24 %b
}
351
declare i16 @llvm.bitreverse.i16(i16) readnone

; i16 bitreverse: rolw $8 swaps the bytes, then nibble/pair/bit swap stages;
; GFNI reverses bits within bytes and finishes with rolw $8.
define i16 @test_bitreverse_i16(i16 %a) nounwind {
; X86-LABEL: test_bitreverse_i16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    rolw $8, %ax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
; X86-NEXT:    shll $4, %ecx
; X86-NEXT:    shrl $4, %eax
; X86-NEXT:    andl $3855, %eax # imm = 0xF0F
; X86-NEXT:    orl %ecx, %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
; X86-NEXT:    shrl $2, %eax
; X86-NEXT:    andl $13107, %eax # imm = 0x3333
; X86-NEXT:    leal (%eax,%ecx,4), %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
; X86-NEXT:    shrl %eax
; X86-NEXT:    andl $21845, %eax # imm = 0x5555
; X86-NEXT:    leal (%eax,%ecx,2), %eax
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i16:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    rolw $8, %di
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
; X64-NEXT:    shll $4, %eax
; X64-NEXT:    shrl $4, %edi
; X64-NEXT:    andl $3855, %edi # imm = 0xF0F
; X64-NEXT:    orl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andl $13107, %eax # imm = 0x3333
; X64-NEXT:    shrl $2, %edi
; X64-NEXT:    andl $13107, %edi # imm = 0x3333
; X64-NEXT:    leal (%rdi,%rax,4), %eax
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    andl $21845, %ecx # imm = 0x5555
; X64-NEXT:    shrl %eax
; X64-NEXT:    andl $21845, %eax # imm = 0x5555
; X64-NEXT:    leal (%rax,%rcx,2), %eax
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i16:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    # kill: def $ax killed $ax killed $eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i16:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vmovd %xmm0, %eax
; X86GFNI-NEXT:    rolw $8, %ax
; X86GFNI-NEXT:    # kill: def $ax killed $ax killed $eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i16:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovd %edi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovd %xmm0, %eax
; X64GFNI-NEXT:    rolw $8, %ax
; X64GFNI-NEXT:    # kill: def $ax killed $ax killed $eax
; X64GFNI-NEXT:    retq
  %b = call i16 @llvm.bitreverse.i16(i16 %a)
  ret i16 %b
}
429
declare i8 @llvm.bitreverse.i8(i8) readnone

; i8 bitreverse: rolb $4 swaps the nibbles, then 2-bit and 1-bit swap stages;
; XOP/GFNI handle the whole byte in a single vector instruction.
define i8 @test_bitreverse_i8(i8 %a) {
; X86-LABEL: test_bitreverse_i8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    rolb $4, %al
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andb $51, %cl
; X86-NEXT:    shlb $2, %cl
; X86-NEXT:    shrb $2, %al
; X86-NEXT:    andb $51, %al
; X86-NEXT:    orb %cl, %al
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    andb $85, %cl
; X86-NEXT:    addb %cl, %cl
; X86-NEXT:    shrb %al
; X86-NEXT:    andb $85, %al
; X86-NEXT:    orb %cl, %al
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i8:
; X64:       # %bb.0:
; X64-NEXT:    rolb $4, %dil
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andb $51, %al
; X64-NEXT:    shlb $2, %al
; X64-NEXT:    shrb $2, %dil
; X64-NEXT:    andb $51, %dil
; X64-NEXT:    orb %dil, %al
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    andb $85, %cl
; X64-NEXT:    addb %cl, %cl
; X64-NEXT:    shrb %al
; X64-NEXT:    andb $85, %al
; X64-NEXT:    orb %cl, %al
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i8:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i8:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vmovd %xmm0, %eax
; X86GFNI-NEXT:    # kill: def $al killed $al killed $eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i8:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovd %edi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovd %xmm0, %eax
; X64GFNI-NEXT:    # kill: def $al killed $al killed $eax
; X64GFNI-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  ret i8 %b
}
494
declare i4 @llvm.bitreverse.i4(i4) readnone

; Sub-byte width: i4 is promoted to an i8 reverse; scalar targets move each of
; the four bits individually, vector targets reverse the byte then shrb $4.
define i4 @test_bitreverse_i4(i4 %a) {
; X86-LABEL: test_bitreverse_i4:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    andb $8, %al
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    addb %cl, %dl
; X86-NEXT:    andb $4, %dl
; X86-NEXT:    movb %cl, %ah
; X86-NEXT:    shlb $3, %ah
; X86-NEXT:    andb $8, %ah
; X86-NEXT:    orb %dl, %ah
; X86-NEXT:    shrb %cl
; X86-NEXT:    andb $2, %cl
; X86-NEXT:    orb %ah, %cl
; X86-NEXT:    shrb $3, %al
; X86-NEXT:    orb %cl, %al
; X86-NEXT:    retl
;
; X64-LABEL: test_bitreverse_i4:
; X64:       # %bb.0:
; X64-NEXT:    # kill: def $edi killed $edi def $rdi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    andb $8, %al
; X64-NEXT:    leal (%rdi,%rdi), %ecx
; X64-NEXT:    andb $4, %cl
; X64-NEXT:    leal (,%rdi,8), %edx
; X64-NEXT:    andb $8, %dl
; X64-NEXT:    orb %cl, %dl
; X64-NEXT:    shrb %dil
; X64-NEXT:    andb $2, %dil
; X64-NEXT:    orb %dil, %dl
; X64-NEXT:    shrb $3, %al
; X64-NEXT:    orb %dl, %al
; X64-NEXT:    retq
;
; X86XOP-LABEL: test_bitreverse_i4:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86XOP-NEXT:    vpperm {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0, %xmm0
; X86XOP-NEXT:    vmovd %xmm0, %eax
; X86XOP-NEXT:    shrb $4, %al
; X86XOP-NEXT:    # kill: def $al killed $al killed $eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: test_bitreverse_i4:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}{1to2}, %xmm0, %xmm0
; X86GFNI-NEXT:    vmovd %xmm0, %eax
; X86GFNI-NEXT:    shrb $4, %al
; X86GFNI-NEXT:    # kill: def $al killed $al killed $eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: test_bitreverse_i4:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    vmovd %edi, %xmm0
; X64GFNI-NEXT:    vgf2p8affineqb $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; X64GFNI-NEXT:    vmovd %xmm0, %eax
; X64GFNI-NEXT:    shrb $4, %al
; X64GFNI-NEXT:    # kill: def $al killed $al killed $eax
; X64GFNI-NEXT:    retq
  %b = call i4 @llvm.bitreverse.i4(i4 %a)
  ret i4 %b
}
563
; These tests check that bitreverse(constant) calls are folded

; bitreverse(<i16 15, i16 3840>) folds to the constant <i16 0xF000, i16 0x00F0>.
define <2 x i16> @fold_v2i16() {
; X86-LABEL: fold_v2i16:
; X86:       # %bb.0:
; X86-NEXT:    movw $-4096, %ax # imm = 0xF000
; X86-NEXT:    movw $240, %dx
; X86-NEXT:    retl
;
; X64-LABEL: fold_v2i16:
; X64:       # %bb.0:
; X64-NEXT:    movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X64-NEXT:    retq
;
; X86XOP-LABEL: fold_v2i16:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; X86XOP-NEXT:    retl
;
; GFNI-LABEL: fold_v2i16:
; GFNI:       # %bb.0:
; GFNI-NEXT:    vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0]
; GFNI-NEXT:    ret{{[l|q]}}
  %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
  ret <2 x i16> %b
}
590
; bitreverse(i24 4096) folds to the constant 2048 on all targets.
define i24 @fold_i24() {
; CHECK-LABEL: fold_i24:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movl $2048, %eax # imm = 0x800
; CHECK-NEXT:    ret{{[l|q]}}
  %b = call i24 @llvm.bitreverse.i24(i24 4096)
  ret i24 %b
}
599
; bitreverse(i8 15) folds to the constant 0xF0 (-16) on all targets.
define i8 @fold_i8() {
; CHECK-LABEL: fold_i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movb $-16, %al
; CHECK-NEXT:    ret{{[l|q]}}
  %b = call i8 @llvm.bitreverse.i8(i8 15)
  ret i8 %b
}
608
; bitreverse(i4 8) folds to the constant 1 on all targets.
define i4 @fold_i4() {
; CHECK-LABEL: fold_i4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    ret{{[l|q]}}
  %b = call i4 @llvm.bitreverse.i4(i4 8)
  ret i4 %b
}
617
; These tests check that bitreverse(bitreverse()) calls are removed

; bitreverse(bitreverse(x)) is an identity: codegen is just a load/move of %a.
define i8 @identity_i8(i8 %a) {
; X86-LABEL: identity_i8:
; X86:       # %bb.0:
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    retl
;
; X64-LABEL: identity_i8:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    # kill: def $al killed $al killed $eax
; X64-NEXT:    retq
;
; X86XOP-LABEL: identity_i8:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86XOP-NEXT:    retl
;
; X86GFNI-LABEL: identity_i8:
; X86GFNI:       # %bb.0:
; X86GFNI-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
; X86GFNI-NEXT:    retl
;
; X64GFNI-LABEL: identity_i8:
; X64GFNI:       # %bb.0:
; X64GFNI-NEXT:    movl %edi, %eax
; X64GFNI-NEXT:    # kill: def $al killed $al killed $eax
; X64GFNI-NEXT:    retq
  %b = call i8 @llvm.bitreverse.i8(i8 %a)
  %c = call i8 @llvm.bitreverse.i8(i8 %b)
  ret i8 %c
}
651
; Vector double-reverse is also removed; X86 just reloads the two lanes, the
; register-passing targets emit nothing but a return.
define <2 x i16> @identity_v2i16(<2 x i16> %a) {
; X86-LABEL: identity_v2i16:
; X86:       # %bb.0:
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    retl
;
; X64-LABEL: identity_v2i16:
; X64:       # %bb.0:
; X64-NEXT:    retq
;
; X86XOP-LABEL: identity_v2i16:
; X86XOP:       # %bb.0:
; X86XOP-NEXT:    retl
;
; GFNI-LABEL: identity_v2i16:
; GFNI:       # %bb.0:
; GFNI-NEXT:    ret{{[l|q]}}
  %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
  %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
  ret <2 x i16> %c
}
674
; These tests check that bitreverse(undef) calls are removed

; bitreverse(undef) folds away completely: only the return is emitted.
define i8 @undef_i8() {
; CHECK-LABEL: undef_i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %b = call i8 @llvm.bitreverse.i8(i8 undef)
  ret i8 %b
}
684
; Vector bitreverse(undef) likewise folds away: only the return is emitted.
define <2 x i16> @undef_v2i16() {
; CHECK-LABEL: undef_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    ret{{[l|q]}}
  %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
  ret <2 x i16> %b
}
692
; Make sure we don't assert during type legalization promoting a large
; bitreverse due to the need for a large shift that won't fit in the i8 returned
; from getShiftAmountTy.
696define i528 @large_promotion(i528 %A) nounwind {
697; X86-LABEL: large_promotion:
698; X86:       # %bb.0:
699; X86-NEXT:    pushl %ebp
700; X86-NEXT:    pushl %ebx
701; X86-NEXT:    pushl %edi
702; X86-NEXT:    pushl %esi
703; X86-NEXT:    subl $60, %esp
704; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
705; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
706; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
707; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
708; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
709; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
710; X86-NEXT:    bswapl %ebx
711; X86-NEXT:    movl %ebx, %ebp
712; X86-NEXT:    andl $252645135, %ebp # imm = 0xF0F0F0F
713; X86-NEXT:    shll $4, %ebp
714; X86-NEXT:    shrl $4, %ebx
715; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
716; X86-NEXT:    orl %ebp, %ebx
717; X86-NEXT:    movl %ebx, %ebp
718; X86-NEXT:    andl $858993459, %ebp # imm = 0x33333333
719; X86-NEXT:    shrl $2, %ebx
720; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
721; X86-NEXT:    leal (%ebx,%ebp,4), %ebx
722; X86-NEXT:    movl %ebx, %ebp
723; X86-NEXT:    andl $1431633920, %ebp # imm = 0x55550000
724; X86-NEXT:    shrl %ebx
725; X86-NEXT:    andl $1431633920, %ebx # imm = 0x55550000
726; X86-NEXT:    leal (%ebx,%ebp,2), %ebp
727; X86-NEXT:    bswapl %edi
728; X86-NEXT:    movl %edi, %ebx
729; X86-NEXT:    andl $252645135, %ebx # imm = 0xF0F0F0F
730; X86-NEXT:    shll $4, %ebx
731; X86-NEXT:    shrl $4, %edi
732; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
733; X86-NEXT:    orl %ebx, %edi
734; X86-NEXT:    movl %edi, %ebx
735; X86-NEXT:    andl $858993459, %ebx # imm = 0x33333333
736; X86-NEXT:    shrl $2, %edi
737; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
738; X86-NEXT:    leal (%edi,%ebx,4), %edi
739; X86-NEXT:    movl %edi, %ebx
740; X86-NEXT:    andl $1431655765, %ebx # imm = 0x55555555
741; X86-NEXT:    shrl %edi
742; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
743; X86-NEXT:    leal (%edi,%ebx,2), %edi
744; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
745; X86-NEXT:    bswapl %esi
746; X86-NEXT:    movl %esi, %edi
747; X86-NEXT:    andl $252645135, %edi # imm = 0xF0F0F0F
748; X86-NEXT:    shll $4, %edi
749; X86-NEXT:    shrl $4, %esi
750; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
751; X86-NEXT:    orl %edi, %esi
752; X86-NEXT:    movl %esi, %edi
753; X86-NEXT:    andl $858993459, %edi # imm = 0x33333333
754; X86-NEXT:    shrl $2, %esi
755; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
756; X86-NEXT:    leal (%esi,%edi,4), %esi
757; X86-NEXT:    movl %esi, %edi
758; X86-NEXT:    andl $1431655765, %edi # imm = 0x55555555
759; X86-NEXT:    shrl %esi
760; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
761; X86-NEXT:    leal (%esi,%edi,2), %ebx
762; X86-NEXT:    bswapl %edx
763; X86-NEXT:    movl %edx, %esi
764; X86-NEXT:    andl $252645135, %esi # imm = 0xF0F0F0F
765; X86-NEXT:    shll $4, %esi
766; X86-NEXT:    shrl $4, %edx
767; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
768; X86-NEXT:    orl %esi, %edx
769; X86-NEXT:    movl %edx, %esi
770; X86-NEXT:    andl $858993459, %esi # imm = 0x33333333
771; X86-NEXT:    shrl $2, %edx
772; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
773; X86-NEXT:    leal (%edx,%esi,4), %edx
774; X86-NEXT:    movl %edx, %esi
775; X86-NEXT:    andl $1431655765, %esi # imm = 0x55555555
776; X86-NEXT:    shrl %edx
777; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
778; X86-NEXT:    leal (%edx,%esi,2), %edx
779; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
780; X86-NEXT:    bswapl %ecx
781; X86-NEXT:    movl %ecx, %edx
782; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
783; X86-NEXT:    shll $4, %edx
784; X86-NEXT:    shrl $4, %ecx
785; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
786; X86-NEXT:    orl %edx, %ecx
787; X86-NEXT:    movl %ecx, %edx
788; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
789; X86-NEXT:    shrl $2, %ecx
790; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
791; X86-NEXT:    leal (%ecx,%edx,4), %ecx
792; X86-NEXT:    movl %ecx, %edx
793; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
794; X86-NEXT:    shrl %ecx
795; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
796; X86-NEXT:    leal (%ecx,%edx,2), %ecx
797; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
798; X86-NEXT:    bswapl %eax
799; X86-NEXT:    movl %eax, %ecx
800; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
801; X86-NEXT:    shll $4, %ecx
802; X86-NEXT:    shrl $4, %eax
803; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
804; X86-NEXT:    orl %ecx, %eax
805; X86-NEXT:    movl %eax, %ecx
806; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
807; X86-NEXT:    shrl $2, %eax
808; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
809; X86-NEXT:    leal (%eax,%ecx,4), %eax
810; X86-NEXT:    movl %eax, %ecx
811; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
812; X86-NEXT:    shrl %eax
813; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
814; X86-NEXT:    leal (%eax,%ecx,2), %eax
815; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
816; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
817; X86-NEXT:    bswapl %eax
818; X86-NEXT:    movl %eax, %ecx
819; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
820; X86-NEXT:    shll $4, %ecx
821; X86-NEXT:    shrl $4, %eax
822; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
823; X86-NEXT:    orl %ecx, %eax
824; X86-NEXT:    movl %eax, %ecx
825; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
826; X86-NEXT:    shrl $2, %eax
827; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
828; X86-NEXT:    leal (%eax,%ecx,4), %eax
829; X86-NEXT:    movl %eax, %ecx
830; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
831; X86-NEXT:    shrl %eax
832; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
833; X86-NEXT:    leal (%eax,%ecx,2), %eax
834; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
835; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
836; X86-NEXT:    bswapl %eax
837; X86-NEXT:    movl %eax, %ecx
838; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
839; X86-NEXT:    shll $4, %ecx
840; X86-NEXT:    shrl $4, %eax
841; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
842; X86-NEXT:    orl %ecx, %eax
843; X86-NEXT:    movl %eax, %ecx
844; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
845; X86-NEXT:    shrl $2, %eax
846; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
847; X86-NEXT:    leal (%eax,%ecx,4), %eax
848; X86-NEXT:    movl %eax, %ecx
849; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
850; X86-NEXT:    shrl %eax
851; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
852; X86-NEXT:    leal (%eax,%ecx,2), %eax
853; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
854; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
855; X86-NEXT:    bswapl %eax
856; X86-NEXT:    movl %eax, %ecx
857; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
858; X86-NEXT:    shll $4, %ecx
859; X86-NEXT:    shrl $4, %eax
860; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
861; X86-NEXT:    orl %ecx, %eax
862; X86-NEXT:    movl %eax, %ecx
863; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
864; X86-NEXT:    shrl $2, %eax
865; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
866; X86-NEXT:    leal (%eax,%ecx,4), %eax
867; X86-NEXT:    movl %eax, %ecx
868; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
869; X86-NEXT:    shrl %eax
870; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
871; X86-NEXT:    leal (%eax,%ecx,2), %eax
872; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
873; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
874; X86-NEXT:    bswapl %eax
875; X86-NEXT:    movl %eax, %ecx
876; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
877; X86-NEXT:    shll $4, %ecx
878; X86-NEXT:    shrl $4, %eax
879; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
880; X86-NEXT:    orl %ecx, %eax
881; X86-NEXT:    movl %eax, %ecx
882; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
883; X86-NEXT:    shrl $2, %eax
884; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
885; X86-NEXT:    leal (%eax,%ecx,4), %eax
886; X86-NEXT:    movl %eax, %ecx
887; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
888; X86-NEXT:    shrl %eax
889; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
890; X86-NEXT:    leal (%eax,%ecx,2), %edi
891; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
892; X86-NEXT:    bswapl %eax
893; X86-NEXT:    movl %eax, %ecx
894; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
895; X86-NEXT:    shll $4, %ecx
896; X86-NEXT:    shrl $4, %eax
897; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
898; X86-NEXT:    orl %ecx, %eax
899; X86-NEXT:    movl %eax, %ecx
900; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
901; X86-NEXT:    shrl $2, %eax
902; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
903; X86-NEXT:    leal (%eax,%ecx,4), %eax
904; X86-NEXT:    movl %eax, %ecx
905; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
906; X86-NEXT:    shrl %eax
907; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
908; X86-NEXT:    leal (%eax,%ecx,2), %eax
909; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
910; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
911; X86-NEXT:    bswapl %eax
912; X86-NEXT:    movl %eax, %ecx
913; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
914; X86-NEXT:    shll $4, %ecx
915; X86-NEXT:    shrl $4, %eax
916; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
917; X86-NEXT:    orl %ecx, %eax
918; X86-NEXT:    movl %eax, %ecx
919; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
920; X86-NEXT:    shrl $2, %eax
921; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
922; X86-NEXT:    leal (%eax,%ecx,4), %eax
923; X86-NEXT:    movl %eax, %ecx
924; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
925; X86-NEXT:    shrl %eax
926; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
927; X86-NEXT:    leal (%eax,%ecx,2), %eax
928; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill
929; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
930; X86-NEXT:    bswapl %eax
931; X86-NEXT:    movl %eax, %ecx
932; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
933; X86-NEXT:    shll $4, %ecx
934; X86-NEXT:    shrl $4, %eax
935; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
936; X86-NEXT:    orl %ecx, %eax
937; X86-NEXT:    movl %eax, %ecx
938; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
939; X86-NEXT:    shrl $2, %eax
940; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
941; X86-NEXT:    leal (%eax,%ecx,4), %eax
942; X86-NEXT:    movl %eax, %ecx
943; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
944; X86-NEXT:    shrl %eax
945; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
946; X86-NEXT:    leal (%eax,%ecx,2), %eax
947; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
948; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
949; X86-NEXT:    bswapl %eax
950; X86-NEXT:    movl %eax, %ecx
951; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
952; X86-NEXT:    shll $4, %ecx
953; X86-NEXT:    shrl $4, %eax
954; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
955; X86-NEXT:    orl %ecx, %eax
956; X86-NEXT:    movl %eax, %ecx
957; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
958; X86-NEXT:    shrl $2, %eax
959; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
960; X86-NEXT:    leal (%eax,%ecx,4), %eax
961; X86-NEXT:    movl %eax, %ecx
962; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
963; X86-NEXT:    shrl %eax
964; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
965; X86-NEXT:    leal (%eax,%ecx,2), %eax
966; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
967; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
968; X86-NEXT:    bswapl %eax
969; X86-NEXT:    movl %eax, %ecx
970; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
971; X86-NEXT:    shll $4, %ecx
972; X86-NEXT:    shrl $4, %eax
973; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
974; X86-NEXT:    orl %ecx, %eax
975; X86-NEXT:    movl %eax, %ecx
976; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
977; X86-NEXT:    shrl $2, %eax
978; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
979; X86-NEXT:    leal (%eax,%ecx,4), %eax
980; X86-NEXT:    movl %eax, %ecx
981; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
982; X86-NEXT:    shrl %eax
983; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
984; X86-NEXT:    leal (%eax,%ecx,2), %eax
985; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
986; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
987; X86-NEXT:    bswapl %eax
988; X86-NEXT:    movl %eax, %ecx
989; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
990; X86-NEXT:    shll $4, %ecx
991; X86-NEXT:    shrl $4, %eax
992; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
993; X86-NEXT:    orl %ecx, %eax
994; X86-NEXT:    movl %eax, %ecx
995; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
996; X86-NEXT:    shrl $2, %eax
997; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
998; X86-NEXT:    leal (%eax,%ecx,4), %eax
999; X86-NEXT:    movl %eax, %ecx
1000; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
1001; X86-NEXT:    shrl %eax
1002; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
1003; X86-NEXT:    leal (%eax,%ecx,2), %eax
1004; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1005; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1006; X86-NEXT:    bswapl %eax
1007; X86-NEXT:    movl %eax, %ecx
1008; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
1009; X86-NEXT:    shll $4, %ecx
1010; X86-NEXT:    shrl $4, %eax
1011; X86-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
1012; X86-NEXT:    orl %ecx, %eax
1013; X86-NEXT:    movl %eax, %ecx
1014; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
1015; X86-NEXT:    shrl $2, %eax
1016; X86-NEXT:    andl $858993459, %eax # imm = 0x33333333
1017; X86-NEXT:    leal (%eax,%ecx,4), %eax
1018; X86-NEXT:    movl %eax, %ecx
1019; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
1020; X86-NEXT:    shrl %eax
1021; X86-NEXT:    andl $1431655765, %eax # imm = 0x55555555
1022; X86-NEXT:    leal (%eax,%ecx,2), %edx
1023; X86-NEXT:    movl %ebp, %esi
1024; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1025; X86-NEXT:    shrdl $16, %ecx, %esi
1026; X86-NEXT:    movl %ebx, %eax
1027; X86-NEXT:    shrdl $16, %ebx, %ecx
1028; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1029; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1030; X86-NEXT:    shrdl $16, %ecx, %eax
1031; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1032; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
1033; X86-NEXT:    shrdl $16, %eax, %ecx
1034; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1035; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1036; X86-NEXT:    shrdl $16, %ecx, %eax
1037; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1038; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
1039; X86-NEXT:    shrdl $16, %eax, %ecx
1040; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1041; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1042; X86-NEXT:    shrdl $16, %ecx, %eax
1043; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1044; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
1045; X86-NEXT:    shrdl $16, %eax, %ecx
1046; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1047; X86-NEXT:    shrdl $16, %edi, %eax
1048; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1049; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
1050; X86-NEXT:    shrdl $16, %eax, %edi
1051; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1052; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
1053; X86-NEXT:    shrdl $16, %ecx, %eax
1054; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1055; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
1056; X86-NEXT:    shrdl $16, %ebp, %ecx
1057; X86-NEXT:    movl %ecx, (%esp) # 4-byte Spill
1058; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
1059; X86-NEXT:    shrdl $16, %ebx, %ebp
1060; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
1061; X86-NEXT:    shrdl $16, %edi, %ebx
1062; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1063; X86-NEXT:    shrdl $16, %ecx, %edi
1064; X86-NEXT:    shrdl $16, %edx, %ecx
1065; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1066; X86-NEXT:    movl %ecx, 60(%eax)
1067; X86-NEXT:    movl %edi, 56(%eax)
1068; X86-NEXT:    movl %ebx, 52(%eax)
1069; X86-NEXT:    movl %ebp, 48(%eax)
1070; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
1071; X86-NEXT:    movl %ecx, 44(%eax)
1072; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1073; X86-NEXT:    movl %ecx, 40(%eax)
1074; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1075; X86-NEXT:    movl %ecx, 36(%eax)
1076; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1077; X86-NEXT:    movl %ecx, 32(%eax)
1078; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1079; X86-NEXT:    movl %ecx, 28(%eax)
1080; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1081; X86-NEXT:    movl %ecx, 24(%eax)
1082; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1083; X86-NEXT:    movl %ecx, 20(%eax)
1084; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1085; X86-NEXT:    movl %ecx, 16(%eax)
1086; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1087; X86-NEXT:    movl %ecx, 12(%eax)
1088; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1089; X86-NEXT:    movl %ecx, 8(%eax)
1090; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1091; X86-NEXT:    movl %ecx, 4(%eax)
1092; X86-NEXT:    movl %esi, (%eax)
1093; X86-NEXT:    shrl $16, %edx
1094; X86-NEXT:    movw %dx, 64(%eax)
1095; X86-NEXT:    addl $60, %esp
1096; X86-NEXT:    popl %esi
1097; X86-NEXT:    popl %edi
1098; X86-NEXT:    popl %ebx
1099; X86-NEXT:    popl %ebp
1100; X86-NEXT:    retl $4
1101;
1102; X64-LABEL: large_promotion:
1103; X64:       # %bb.0:
1104; X64-NEXT:    pushq %r15
1105; X64-NEXT:    pushq %r14
1106; X64-NEXT:    pushq %r13
1107; X64-NEXT:    pushq %r12
1108; X64-NEXT:    pushq %rbx
1109; X64-NEXT:    movq %rdi, %rax
1110; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
1111; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
1112; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
1113; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
1114; X64-NEXT:    bswapq %rdi
1115; X64-NEXT:    movq %rdi, %r10
1116; X64-NEXT:    shrq $4, %r10
1117; X64-NEXT:    movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
1118; X64-NEXT:    andq %r11, %r10
1119; X64-NEXT:    andq %r11, %rdi
1120; X64-NEXT:    shlq $4, %rdi
1121; X64-NEXT:    orq %r10, %rdi
1122; X64-NEXT:    movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
1123; X64-NEXT:    movq %rdi, %r14
1124; X64-NEXT:    andq %r10, %r14
1125; X64-NEXT:    shrq $2, %rdi
1126; X64-NEXT:    andq %r10, %rdi
1127; X64-NEXT:    leaq (%rdi,%r14,4), %rdi
1128; X64-NEXT:    movabsq $6148820866244280320, %r14 # imm = 0x5555000000000000
1129; X64-NEXT:    movq %rdi, %r13
1130; X64-NEXT:    andq %r14, %r13
1131; X64-NEXT:    shrq %rdi
1132; X64-NEXT:    andq %r14, %rdi
1133; X64-NEXT:    leaq (%rdi,%r13,2), %rdi
1134; X64-NEXT:    bswapq %rbx
1135; X64-NEXT:    movq %rbx, %r14
1136; X64-NEXT:    shrq $4, %r14
1137; X64-NEXT:    andq %r11, %r14
1138; X64-NEXT:    andq %r11, %rbx
1139; X64-NEXT:    shlq $4, %rbx
1140; X64-NEXT:    orq %r14, %rbx
1141; X64-NEXT:    movq %rbx, %r14
1142; X64-NEXT:    andq %r10, %r14
1143; X64-NEXT:    shrq $2, %rbx
1144; X64-NEXT:    andq %r10, %rbx
1145; X64-NEXT:    leaq (%rbx,%r14,4), %rbx
1146; X64-NEXT:    movabsq $6148914691236517205, %r14 # imm = 0x5555555555555555
1147; X64-NEXT:    movq %rbx, %r13
1148; X64-NEXT:    andq %r14, %r13
1149; X64-NEXT:    shrq %rbx
1150; X64-NEXT:    andq %r14, %rbx
1151; X64-NEXT:    leaq (%rbx,%r13,2), %rbx
1152; X64-NEXT:    shrdq $48, %rbx, %rdi
1153; X64-NEXT:    bswapq %r15
1154; X64-NEXT:    movq %r15, %r13
1155; X64-NEXT:    shrq $4, %r13
1156; X64-NEXT:    andq %r11, %r13
1157; X64-NEXT:    andq %r11, %r15
1158; X64-NEXT:    shlq $4, %r15
1159; X64-NEXT:    orq %r13, %r15
1160; X64-NEXT:    movq %r15, %r13
1161; X64-NEXT:    andq %r10, %r13
1162; X64-NEXT:    shrq $2, %r15
1163; X64-NEXT:    andq %r10, %r15
1164; X64-NEXT:    leaq (%r15,%r13,4), %r15
1165; X64-NEXT:    movq %r15, %r13
1166; X64-NEXT:    andq %r14, %r13
1167; X64-NEXT:    shrq %r15
1168; X64-NEXT:    andq %r14, %r15
1169; X64-NEXT:    leaq (%r15,%r13,2), %r15
1170; X64-NEXT:    shrdq $48, %r15, %rbx
1171; X64-NEXT:    bswapq %r12
1172; X64-NEXT:    movq %r12, %r13
1173; X64-NEXT:    shrq $4, %r13
1174; X64-NEXT:    andq %r11, %r13
1175; X64-NEXT:    andq %r11, %r12
1176; X64-NEXT:    shlq $4, %r12
1177; X64-NEXT:    orq %r13, %r12
1178; X64-NEXT:    movq %r12, %r13
1179; X64-NEXT:    andq %r10, %r13
1180; X64-NEXT:    shrq $2, %r12
1181; X64-NEXT:    andq %r10, %r12
1182; X64-NEXT:    leaq (%r12,%r13,4), %r12
1183; X64-NEXT:    movq %r12, %r13
1184; X64-NEXT:    andq %r14, %r13
1185; X64-NEXT:    shrq %r12
1186; X64-NEXT:    andq %r14, %r12
1187; X64-NEXT:    leaq (%r12,%r13,2), %r12
1188; X64-NEXT:    shrdq $48, %r12, %r15
1189; X64-NEXT:    bswapq %r9
1190; X64-NEXT:    movq %r9, %r13
1191; X64-NEXT:    shrq $4, %r13
1192; X64-NEXT:    andq %r11, %r13
1193; X64-NEXT:    andq %r11, %r9
1194; X64-NEXT:    shlq $4, %r9
1195; X64-NEXT:    orq %r13, %r9
1196; X64-NEXT:    movq %r9, %r13
1197; X64-NEXT:    andq %r10, %r13
1198; X64-NEXT:    shrq $2, %r9
1199; X64-NEXT:    andq %r10, %r9
1200; X64-NEXT:    leaq (%r9,%r13,4), %r9
1201; X64-NEXT:    movq %r9, %r13
1202; X64-NEXT:    andq %r14, %r13
1203; X64-NEXT:    shrq %r9
1204; X64-NEXT:    andq %r14, %r9
1205; X64-NEXT:    leaq (%r9,%r13,2), %r9
1206; X64-NEXT:    shrdq $48, %r9, %r12
1207; X64-NEXT:    bswapq %r8
1208; X64-NEXT:    movq %r8, %r13
1209; X64-NEXT:    shrq $4, %r13
1210; X64-NEXT:    andq %r11, %r13
1211; X64-NEXT:    andq %r11, %r8
1212; X64-NEXT:    shlq $4, %r8
1213; X64-NEXT:    orq %r13, %r8
1214; X64-NEXT:    movq %r8, %r13
1215; X64-NEXT:    andq %r10, %r13
1216; X64-NEXT:    shrq $2, %r8
1217; X64-NEXT:    andq %r10, %r8
1218; X64-NEXT:    leaq (%r8,%r13,4), %r8
1219; X64-NEXT:    movq %r8, %r13
1220; X64-NEXT:    andq %r14, %r13
1221; X64-NEXT:    shrq %r8
1222; X64-NEXT:    andq %r14, %r8
1223; X64-NEXT:    leaq (%r8,%r13,2), %r8
1224; X64-NEXT:    shrdq $48, %r8, %r9
1225; X64-NEXT:    bswapq %rcx
1226; X64-NEXT:    movq %rcx, %r13
1227; X64-NEXT:    shrq $4, %r13
1228; X64-NEXT:    andq %r11, %r13
1229; X64-NEXT:    andq %r11, %rcx
1230; X64-NEXT:    shlq $4, %rcx
1231; X64-NEXT:    orq %r13, %rcx
1232; X64-NEXT:    movq %rcx, %r13
1233; X64-NEXT:    andq %r10, %r13
1234; X64-NEXT:    shrq $2, %rcx
1235; X64-NEXT:    andq %r10, %rcx
1236; X64-NEXT:    leaq (%rcx,%r13,4), %rcx
1237; X64-NEXT:    movq %rcx, %r13
1238; X64-NEXT:    andq %r14, %r13
1239; X64-NEXT:    shrq %rcx
1240; X64-NEXT:    andq %r14, %rcx
1241; X64-NEXT:    leaq (%rcx,%r13,2), %rcx
1242; X64-NEXT:    shrdq $48, %rcx, %r8
1243; X64-NEXT:    bswapq %rdx
1244; X64-NEXT:    movq %rdx, %r13
1245; X64-NEXT:    shrq $4, %r13
1246; X64-NEXT:    andq %r11, %r13
1247; X64-NEXT:    andq %r11, %rdx
1248; X64-NEXT:    shlq $4, %rdx
1249; X64-NEXT:    orq %r13, %rdx
1250; X64-NEXT:    movq %rdx, %r13
1251; X64-NEXT:    andq %r10, %r13
1252; X64-NEXT:    shrq $2, %rdx
1253; X64-NEXT:    andq %r10, %rdx
1254; X64-NEXT:    leaq (%rdx,%r13,4), %rdx
1255; X64-NEXT:    movq %rdx, %r13
1256; X64-NEXT:    andq %r14, %r13
1257; X64-NEXT:    shrq %rdx
1258; X64-NEXT:    andq %r14, %rdx
1259; X64-NEXT:    leaq (%rdx,%r13,2), %rdx
1260; X64-NEXT:    shrdq $48, %rdx, %rcx
1261; X64-NEXT:    bswapq %rsi
1262; X64-NEXT:    movq %rsi, %r13
1263; X64-NEXT:    shrq $4, %r13
1264; X64-NEXT:    andq %r11, %r13
1265; X64-NEXT:    andq %r11, %rsi
1266; X64-NEXT:    shlq $4, %rsi
1267; X64-NEXT:    orq %r13, %rsi
1268; X64-NEXT:    movq %rsi, %r11
1269; X64-NEXT:    andq %r10, %r11
1270; X64-NEXT:    shrq $2, %rsi
1271; X64-NEXT:    andq %r10, %rsi
1272; X64-NEXT:    leaq (%rsi,%r11,4), %rsi
1273; X64-NEXT:    movq %rsi, %r10
1274; X64-NEXT:    andq %r14, %r10
1275; X64-NEXT:    shrq %rsi
1276; X64-NEXT:    andq %r14, %rsi
1277; X64-NEXT:    leaq (%rsi,%r10,2), %rsi
1278; X64-NEXT:    shrdq $48, %rsi, %rdx
1279; X64-NEXT:    shrq $48, %rsi
1280; X64-NEXT:    movq %rdx, 56(%rax)
1281; X64-NEXT:    movq %rcx, 48(%rax)
1282; X64-NEXT:    movq %r8, 40(%rax)
1283; X64-NEXT:    movq %r9, 32(%rax)
1284; X64-NEXT:    movq %r12, 24(%rax)
1285; X64-NEXT:    movq %r15, 16(%rax)
1286; X64-NEXT:    movq %rbx, 8(%rax)
1287; X64-NEXT:    movq %rdi, (%rax)
1288; X64-NEXT:    movw %si, 64(%rax)
1289; X64-NEXT:    popq %rbx
1290; X64-NEXT:    popq %r12
1291; X64-NEXT:    popq %r13
1292; X64-NEXT:    popq %r14
1293; X64-NEXT:    popq %r15
1294; X64-NEXT:    retq
1295;
1296; X86XOP-LABEL: large_promotion:
1297; X86XOP:       # %bb.0:
1298; X86XOP-NEXT:    pushl %ebp
1299; X86XOP-NEXT:    pushl %ebx
1300; X86XOP-NEXT:    pushl %edi
1301; X86XOP-NEXT:    pushl %esi
1302; X86XOP-NEXT:    subl $44, %esp
1303; X86XOP-NEXT:    vmovdqa {{.*#+}} xmm0 = [87,86,85,84,83,82,81,80,95,94,93,92,91,90,89,88]
1304; X86XOP-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1305; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1306; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1307; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1308; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1309; X86XOP-NEXT:    vmovd %xmm1, %ecx
1310; X86XOP-NEXT:    shrdl $16, %ecx, %eax
1311; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1312; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1313; X86XOP-NEXT:    shrdl $16, %eax, %ecx
1314; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1315; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1316; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1317; X86XOP-NEXT:    vmovd %xmm1, %ecx
1318; X86XOP-NEXT:    shrdl $16, %ecx, %eax
1319; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1320; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1321; X86XOP-NEXT:    shrdl $16, %eax, %ecx
1322; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1323; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1324; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1325; X86XOP-NEXT:    vmovd %xmm1, %ecx
1326; X86XOP-NEXT:    shrdl $16, %ecx, %eax
1327; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1328; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1329; X86XOP-NEXT:    shrdl $16, %eax, %ecx
1330; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1331; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1332; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1333; X86XOP-NEXT:    vmovd %xmm1, %ecx
1334; X86XOP-NEXT:    shrdl $16, %ecx, %eax
1335; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1336; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1337; X86XOP-NEXT:    shrdl $16, %eax, %ecx
1338; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1339; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1340; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1341; X86XOP-NEXT:    vmovd %xmm1, %ecx
1342; X86XOP-NEXT:    shrdl $16, %ecx, %eax
1343; X86XOP-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1344; X86XOP-NEXT:    vpextrd $1, %xmm1, %eax
1345; X86XOP-NEXT:    shrdl $16, %eax, %ecx
1346; X86XOP-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1347; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1348; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1349; X86XOP-NEXT:    vmovd %xmm1, %ebp
1350; X86XOP-NEXT:    shrdl $16, %ebp, %eax
1351; X86XOP-NEXT:    movl %eax, (%esp) # 4-byte Spill
1352; X86XOP-NEXT:    vpextrd $1, %xmm1, %ebx
1353; X86XOP-NEXT:    shrdl $16, %ebx, %ebp
1354; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1355; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm1
1356; X86XOP-NEXT:    vmovd %xmm1, %esi
1357; X86XOP-NEXT:    shrdl $16, %esi, %ebx
1358; X86XOP-NEXT:    vpextrd $1, %xmm1, %edx
1359; X86XOP-NEXT:    shrdl $16, %edx, %esi
1360; X86XOP-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1361; X86XOP-NEXT:    vpperm %xmm0, %xmm1, %xmm0, %xmm0
1362; X86XOP-NEXT:    vmovd %xmm0, %ecx
1363; X86XOP-NEXT:    shrdl $16, %ecx, %edx
1364; X86XOP-NEXT:    vpextrd $1, %xmm0, %edi
1365; X86XOP-NEXT:    shrdl $16, %edi, %ecx
1366; X86XOP-NEXT:    movl {{[0-9]+}}(%esp), %eax
1367; X86XOP-NEXT:    movl %ecx, 60(%eax)
1368; X86XOP-NEXT:    movl %edx, 56(%eax)
1369; X86XOP-NEXT:    movl %esi, 52(%eax)
1370; X86XOP-NEXT:    movl %ebx, 48(%eax)
1371; X86XOP-NEXT:    movl %ebp, 44(%eax)
1372; X86XOP-NEXT:    movl (%esp), %ecx # 4-byte Reload
1373; X86XOP-NEXT:    movl %ecx, 40(%eax)
1374; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1375; X86XOP-NEXT:    movl %ecx, 36(%eax)
1376; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1377; X86XOP-NEXT:    movl %ecx, 32(%eax)
1378; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1379; X86XOP-NEXT:    movl %ecx, 28(%eax)
1380; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1381; X86XOP-NEXT:    movl %ecx, 24(%eax)
1382; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1383; X86XOP-NEXT:    movl %ecx, 20(%eax)
1384; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1385; X86XOP-NEXT:    movl %ecx, 16(%eax)
1386; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1387; X86XOP-NEXT:    movl %ecx, 12(%eax)
1388; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1389; X86XOP-NEXT:    movl %ecx, 8(%eax)
1390; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1391; X86XOP-NEXT:    movl %ecx, 4(%eax)
1392; X86XOP-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1393; X86XOP-NEXT:    movl %ecx, (%eax)
1394; X86XOP-NEXT:    shrl $16, %edi
1395; X86XOP-NEXT:    movw %di, 64(%eax)
1396; X86XOP-NEXT:    addl $44, %esp
1397; X86XOP-NEXT:    popl %esi
1398; X86XOP-NEXT:    popl %edi
1399; X86XOP-NEXT:    popl %ebx
1400; X86XOP-NEXT:    popl %ebp
1401; X86XOP-NEXT:    retl $4
1402;
1403; X86GFNI-LABEL: large_promotion:
1404; X86GFNI:       # %bb.0:
1405; X86GFNI-NEXT:    pushl %ebp
1406; X86GFNI-NEXT:    pushl %ebx
1407; X86GFNI-NEXT:    pushl %edi
1408; X86GFNI-NEXT:    pushl %esi
1409; X86GFNI-NEXT:    subl $44, %esp
1410; X86GFNI-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1411; X86GFNI-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
1412; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1413; X86GFNI-NEXT:    vmovd %xmm1, %eax
1414; X86GFNI-NEXT:    bswapl %eax
1415; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1416; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1417; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ecx
1418; X86GFNI-NEXT:    bswapl %ecx
1419; X86GFNI-NEXT:    shrdl $16, %ecx, %eax
1420; X86GFNI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1421; X86GFNI-NEXT:    vmovd %xmm1, %eax
1422; X86GFNI-NEXT:    bswapl %eax
1423; X86GFNI-NEXT:    shrdl $16, %eax, %ecx
1424; X86GFNI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1425; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1426; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1427; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ecx
1428; X86GFNI-NEXT:    bswapl %ecx
1429; X86GFNI-NEXT:    shrdl $16, %ecx, %eax
1430; X86GFNI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1431; X86GFNI-NEXT:    vmovd %xmm1, %eax
1432; X86GFNI-NEXT:    bswapl %eax
1433; X86GFNI-NEXT:    shrdl $16, %eax, %ecx
1434; X86GFNI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1435; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1436; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1437; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ecx
1438; X86GFNI-NEXT:    bswapl %ecx
1439; X86GFNI-NEXT:    shrdl $16, %ecx, %eax
1440; X86GFNI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1441; X86GFNI-NEXT:    vmovd %xmm1, %eax
1442; X86GFNI-NEXT:    bswapl %eax
1443; X86GFNI-NEXT:    shrdl $16, %eax, %ecx
1444; X86GFNI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1445; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1446; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1447; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ecx
1448; X86GFNI-NEXT:    bswapl %ecx
1449; X86GFNI-NEXT:    shrdl $16, %ecx, %eax
1450; X86GFNI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1451; X86GFNI-NEXT:    vmovd %xmm1, %eax
1452; X86GFNI-NEXT:    bswapl %eax
1453; X86GFNI-NEXT:    shrdl $16, %eax, %ecx
1454; X86GFNI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1455; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1456; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1457; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ecx
1458; X86GFNI-NEXT:    bswapl %ecx
1459; X86GFNI-NEXT:    shrdl $16, %ecx, %eax
1460; X86GFNI-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1461; X86GFNI-NEXT:    vmovd %xmm1, %eax
1462; X86GFNI-NEXT:    bswapl %eax
1463; X86GFNI-NEXT:    shrdl $16, %eax, %ecx
1464; X86GFNI-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
1465; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1466; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1467; X86GFNI-NEXT:    vpextrd $1, %xmm1, %ebp
1468; X86GFNI-NEXT:    bswapl %ebp
1469; X86GFNI-NEXT:    shrdl $16, %ebp, %eax
1470; X86GFNI-NEXT:    movl %eax, (%esp) # 4-byte Spill
1471; X86GFNI-NEXT:    vmovd %xmm1, %ebx
1472; X86GFNI-NEXT:    bswapl %ebx
1473; X86GFNI-NEXT:    shrdl $16, %ebx, %ebp
1474; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1475; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1476; X86GFNI-NEXT:    vpextrd $1, %xmm1, %edi
1477; X86GFNI-NEXT:    bswapl %edi
1478; X86GFNI-NEXT:    shrdl $16, %edi, %ebx
1479; X86GFNI-NEXT:    vmovd %xmm1, %edx
1480; X86GFNI-NEXT:    bswapl %edx
1481; X86GFNI-NEXT:    shrdl $16, %edx, %edi
1482; X86GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1483; X86GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm0
1484; X86GFNI-NEXT:    vpextrd $1, %xmm0, %ecx
1485; X86GFNI-NEXT:    bswapl %ecx
1486; X86GFNI-NEXT:    shrdl $16, %ecx, %edx
1487; X86GFNI-NEXT:    vmovd %xmm0, %esi
1488; X86GFNI-NEXT:    bswapl %esi
1489; X86GFNI-NEXT:    shrdl $16, %esi, %ecx
1490; X86GFNI-NEXT:    movl {{[0-9]+}}(%esp), %eax
1491; X86GFNI-NEXT:    movl %ecx, 60(%eax)
1492; X86GFNI-NEXT:    movl %edx, 56(%eax)
1493; X86GFNI-NEXT:    movl %edi, 52(%eax)
1494; X86GFNI-NEXT:    movl %ebx, 48(%eax)
1495; X86GFNI-NEXT:    movl %ebp, 44(%eax)
1496; X86GFNI-NEXT:    movl (%esp), %ecx # 4-byte Reload
1497; X86GFNI-NEXT:    movl %ecx, 40(%eax)
1498; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1499; X86GFNI-NEXT:    movl %ecx, 36(%eax)
1500; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1501; X86GFNI-NEXT:    movl %ecx, 32(%eax)
1502; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1503; X86GFNI-NEXT:    movl %ecx, 28(%eax)
1504; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1505; X86GFNI-NEXT:    movl %ecx, 24(%eax)
1506; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1507; X86GFNI-NEXT:    movl %ecx, 20(%eax)
1508; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1509; X86GFNI-NEXT:    movl %ecx, 16(%eax)
1510; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1511; X86GFNI-NEXT:    movl %ecx, 12(%eax)
1512; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1513; X86GFNI-NEXT:    movl %ecx, 8(%eax)
1514; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1515; X86GFNI-NEXT:    movl %ecx, 4(%eax)
1516; X86GFNI-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
1517; X86GFNI-NEXT:    movl %ecx, (%eax)
1518; X86GFNI-NEXT:    shrl $16, %esi
1519; X86GFNI-NEXT:    movw %si, 64(%eax)
1520; X86GFNI-NEXT:    addl $44, %esp
1521; X86GFNI-NEXT:    popl %esi
1522; X86GFNI-NEXT:    popl %edi
1523; X86GFNI-NEXT:    popl %ebx
1524; X86GFNI-NEXT:    popl %ebp
1525; X86GFNI-NEXT:    retl $4
1526;
1527; X64GFNI-LABEL: large_promotion:
1528; X64GFNI:       # %bb.0:
1529; X64GFNI-NEXT:    pushq %r14
1530; X64GFNI-NEXT:    pushq %rbx
1531; X64GFNI-NEXT:    movq %rdi, %rax
1532; X64GFNI-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
1533; X64GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1534; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1535; X64GFNI-NEXT:    vmovq %xmm1, %r10
1536; X64GFNI-NEXT:    bswapq %r10
1537; X64GFNI-NEXT:    vmovq %r9, %xmm1
1538; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1539; X64GFNI-NEXT:    vmovq %xmm1, %rdi
1540; X64GFNI-NEXT:    bswapq %rdi
1541; X64GFNI-NEXT:    vmovq %r8, %xmm1
1542; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1543; X64GFNI-NEXT:    vmovq %xmm1, %r8
1544; X64GFNI-NEXT:    bswapq %r8
1545; X64GFNI-NEXT:    movq %r8, %r9
1546; X64GFNI-NEXT:    shldq $16, %rdi, %r9
1547; X64GFNI-NEXT:    shldq $16, %r10, %rdi
1548; X64GFNI-NEXT:    vmovq %rcx, %xmm1
1549; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1550; X64GFNI-NEXT:    vmovq %xmm1, %rcx
1551; X64GFNI-NEXT:    bswapq %rcx
1552; X64GFNI-NEXT:    shrdq $48, %rcx, %r8
1553; X64GFNI-NEXT:    vmovq %rdx, %xmm1
1554; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1555; X64GFNI-NEXT:    vmovq %xmm1, %rdx
1556; X64GFNI-NEXT:    bswapq %rdx
1557; X64GFNI-NEXT:    shrdq $48, %rdx, %rcx
1558; X64GFNI-NEXT:    vmovq %rsi, %xmm1
1559; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1560; X64GFNI-NEXT:    vmovq %xmm1, %rsi
1561; X64GFNI-NEXT:    bswapq %rsi
1562; X64GFNI-NEXT:    shrdq $48, %rsi, %rdx
1563; X64GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1564; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1565; X64GFNI-NEXT:    vmovq %xmm1, %r11
1566; X64GFNI-NEXT:    bswapq %r11
1567; X64GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1568; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm1
1569; X64GFNI-NEXT:    vmovq %xmm1, %rbx
1570; X64GFNI-NEXT:    bswapq %rbx
1571; X64GFNI-NEXT:    shrdq $48, %rbx, %r11
1572; X64GFNI-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
1573; X64GFNI-NEXT:    vgf2p8affineqb $0, %xmm0, %xmm1, %xmm0
1574; X64GFNI-NEXT:    vmovq %xmm0, %r14
1575; X64GFNI-NEXT:    bswapq %r14
1576; X64GFNI-NEXT:    shrdq $48, %r14, %rbx
1577; X64GFNI-NEXT:    shrdq $48, %r10, %r14
1578; X64GFNI-NEXT:    shrq $48, %rsi
1579; X64GFNI-NEXT:    movq %r14, 16(%rax)
1580; X64GFNI-NEXT:    movq %rbx, 8(%rax)
1581; X64GFNI-NEXT:    movq %r11, (%rax)
1582; X64GFNI-NEXT:    movq %rdx, 56(%rax)
1583; X64GFNI-NEXT:    movq %rcx, 48(%rax)
1584; X64GFNI-NEXT:    movq %r8, 40(%rax)
1585; X64GFNI-NEXT:    movq %r9, 32(%rax)
1586; X64GFNI-NEXT:    movq %rdi, 24(%rax)
1587; X64GFNI-NEXT:    movw %si, 64(%rax)
1588; X64GFNI-NEXT:    popq %rbx
1589; X64GFNI-NEXT:    popq %r14
1590; X64GFNI-NEXT:    retq
1591  %Z = call i528 @llvm.bitreverse.i528(i528 %A)
1592  ret i528 %Z
1593}
; Declaration of the bitreverse intrinsic at a non-power-of-2 width (i528),
; used by @large_promotion above to exercise wide-integer legalization of
; @llvm.bitreverse into 32-/64-bit chunks with shrd-based funnel shifts.
declare i528 @llvm.bitreverse.i528(i528)
1595