; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse  | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx  | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f  -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW

; https://llvm.org/bugs/show_bug.cgi?id=27100
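;
; A memset of a repeated nonzero byte should be expanded inline into splat
; stores sized for the target (GPR, XMM, YMM, or ZMM) rather than always
; calling memset. Splatting the byte 42 (0x2A) across a 64-bit register gives
; 0x2A2A2A2A2A2A2A2A (3038287259199220266), the immediate used by the scalar
; SSE lowering below, and splatting it across a 32-bit element gives
; 0x2A2A2A2A (707406378), the value broadcast by the AVX512F lowerings.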

define void @memset_16_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_16_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_16_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %xmm0, (%rdi)
; AVX-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1)
  ret void
}

define void @memset_32_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_32_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX-LABEL: memset_32_nonzero_bytes:
; AVX:       # %bb.0:
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX-NEXT:    vmovups %ymm0, (%rdi)
; AVX-NEXT:    vzeroupper
; AVX-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 32, i64 -1)
  ret void
}

define void @memset_64_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_64_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 64, i64 -1)
  ret void
}

define void @memset_128_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_128_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A
; SSE-NEXT:    movq %rax, 120(%rdi)
; SSE-NEXT:    movq %rax, 112(%rdi)
; SSE-NEXT:    movq %rax, 104(%rdi)
; SSE-NEXT:    movq %rax, 96(%rdi)
; SSE-NEXT:    movq %rax, 88(%rdi)
; SSE-NEXT:    movq %rax, 80(%rdi)
; SSE-NEXT:    movq %rax, 72(%rdi)
; SSE-NEXT:    movq %rax, 64(%rdi)
; SSE-NEXT:    movq %rax, 56(%rdi)
; SSE-NEXT:    movq %rax, 48(%rdi)
; SSE-NEXT:    movq %rax, 40(%rdi)
; SSE-NEXT:    movq %rax, 32(%rdi)
; SSE-NEXT:    movq %rax, 24(%rdi)
; SSE-NEXT:    movq %rax, 16(%rdi)
; SSE-NEXT:    movq %rax, 8(%rdi)
; SSE-NEXT:    movq %rax, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 128, i64 -1)
  ret void
}

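; At 256 bytes, the baseline SSE configuration (with slow unaligned 16-byte
; accesses) stops expanding the memset inline and tail-calls memset instead;
; the configurations with wider or fast unaligned stores still expand it.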
define void @memset_256_nonzero_bytes(ptr %x) {
; SSE-LABEL: memset_256_nonzero_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    movl $42, %esi
; SSE-NEXT:    jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonzero_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; SSE2FAST-NEXT:    movups %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movups %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonzero_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonzero_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX2-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovups %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonzero_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378]
; AVX512F-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovups %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonzero_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42]
; AVX512BW-NEXT:    vmovups %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovups %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 256, i64 -1)
  ret void
}

declare ptr @__memset_chk(ptr, i32, i64, i64)

; Repeat with a non-constant value for the stores.
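; With a variable byte the splat must be materialized first: the scalar SSE
; lowering multiplies the zero-extended byte by 0x0101010101010101, the SSE2
; vector lowering shuffles it with punpcklbw/pshuflw/pshufd, AVX1 uses vpshufb
; with a zero mask, and AVX2/AVX512 use vpbroadcastb. For the 64-byte-wide
; stores, AVX512F multiplies by 0x01010101 and uses vpbroadcastd instead,
; since zmm byte broadcasts require AVX512BW.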

define void @memset_16_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_16_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_16_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_16_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_16_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_16_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX512-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX512-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 16, i1 false)
  ret void
}

define void @memset_32_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_32_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_32_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_32_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqu %xmm0, 16(%rdi)
; AVX1-NEXT:    vmovdqu %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_32_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: memset_32_nonconst_bytes:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %esi, %xmm0
; AVX512-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX512-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 32, i1 false)
  ret void
}

define void @memset_64_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_64_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_64_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_64_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_64_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_64_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_64_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 64, i1 false)
  ret void
}

define void @memset_128_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_128_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movzbl %sil, %eax
; SSE-NEXT:    movabsq $72340172838076673, %rcx # imm = 0x101010101010101
; SSE-NEXT:    imulq %rax, %rcx
; SSE-NEXT:    movq %rcx, 120(%rdi)
; SSE-NEXT:    movq %rcx, 112(%rdi)
; SSE-NEXT:    movq %rcx, 104(%rdi)
; SSE-NEXT:    movq %rcx, 96(%rdi)
; SSE-NEXT:    movq %rcx, 88(%rdi)
; SSE-NEXT:    movq %rcx, 80(%rdi)
; SSE-NEXT:    movq %rcx, 72(%rdi)
; SSE-NEXT:    movq %rcx, 64(%rdi)
; SSE-NEXT:    movq %rcx, 56(%rdi)
; SSE-NEXT:    movq %rcx, 48(%rdi)
; SSE-NEXT:    movq %rcx, 40(%rdi)
; SSE-NEXT:    movq %rcx, 32(%rdi)
; SSE-NEXT:    movq %rcx, 24(%rdi)
; SSE-NEXT:    movq %rcx, 16(%rdi)
; SSE-NEXT:    movq %rcx, 8(%rdi)
; SSE-NEXT:    movq %rcx, (%rdi)
; SSE-NEXT:    retq
;
; SSE2FAST-LABEL: memset_128_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_128_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_128_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_128_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_128_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 128, i1 false)
  ret void
}

define void @memset_256_nonconst_bytes(ptr %x, i8 %c) {
; SSE-LABEL: memset_256_nonconst_bytes:
; SSE:       # %bb.0:
; SSE-NEXT:    movl $256, %edx # imm = 0x100
; SSE-NEXT:    jmp memset@PLT # TAILCALL
;
; SSE2FAST-LABEL: memset_256_nonconst_bytes:
; SSE2FAST:       # %bb.0:
; SSE2FAST-NEXT:    movd %esi, %xmm0
; SSE2FAST-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2FAST-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
; SSE2FAST-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2FAST-NEXT:    movdqu %xmm0, 240(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 224(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 208(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 192(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 176(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 160(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 144(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 128(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 112(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 96(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 80(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 64(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 48(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 32(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, 16(%rdi)
; SSE2FAST-NEXT:    movdqu %xmm0, (%rdi)
; SSE2FAST-NEXT:    retq
;
; AVX1-LABEL: memset_256_nonconst_bytes:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %esi, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT:    vmovups %ymm0, 224(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 192(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 160(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 128(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 96(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 64(%rdi)
; AVX1-NEXT:    vmovups %ymm0, 32(%rdi)
; AVX1-NEXT:    vmovups %ymm0, (%rdi)
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: memset_256_nonconst_bytes:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %esi, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
; AVX2-NEXT:    vmovdqu %ymm0, 224(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 192(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 160(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 128(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 96(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 64(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, 32(%rdi)
; AVX2-NEXT:    vmovdqu %ymm0, (%rdi)
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: memset_256_nonconst_bytes:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    movzbl %sil, %eax
; AVX512F-NEXT:    imull $16843009, %eax, %eax # imm = 0x1010101
; AVX512F-NEXT:    vpbroadcastd %eax, %zmm0
; AVX512F-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512BW-LABEL: memset_256_nonconst_bytes:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastb %esi, %zmm0
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 192(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 128(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, 64(%rdi)
; AVX512BW-NEXT:    vmovdqu64 %zmm0, (%rdi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
  tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 256, i1 false)
  ret void
}

declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) #1
