1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse | FileCheck %s --check-prefix=SSE 3; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2 | FileCheck %s --check-prefix=SSE 4; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=sse2,-slow-unaligned-mem-16 | FileCheck %s --check-prefix=SSE2FAST 5; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 6; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 7; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 8; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 9; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512dq -mattr=+prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX2 10; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512f -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F 11; RUN: llc -mtriple=x86_64-unknown-unknown < %s -mattr=avx512bw -mattr=-prefer-256-bit | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW 12 13; https://llvm.org/bugs/show_bug.cgi?id=27100 14 15define void @memset_16_nonzero_bytes(ptr %x) { 16; SSE-LABEL: memset_16_nonzero_bytes: 17; SSE: # %bb.0: 18; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A 19; SSE-NEXT: movq %rax, 8(%rdi) 20; SSE-NEXT: movq %rax, (%rdi) 21; SSE-NEXT: retq 22; 23; SSE2FAST-LABEL: memset_16_nonzero_bytes: 24; SSE2FAST: # %bb.0: 25; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 26; SSE2FAST-NEXT: movups %xmm0, (%rdi) 27; SSE2FAST-NEXT: retq 28; 29; AVX-LABEL: memset_16_nonzero_bytes: 30; AVX: # %bb.0: 31; 
AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 32; AVX-NEXT: vmovups %xmm0, (%rdi) 33; AVX-NEXT: retq 34 %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 16, i64 -1) 35 ret void 36} 37 38define void @memset_32_nonzero_bytes(ptr %x) { 39; SSE-LABEL: memset_32_nonzero_bytes: 40; SSE: # %bb.0: 41; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A 42; SSE-NEXT: movq %rax, 24(%rdi) 43; SSE-NEXT: movq %rax, 16(%rdi) 44; SSE-NEXT: movq %rax, 8(%rdi) 45; SSE-NEXT: movq %rax, (%rdi) 46; SSE-NEXT: retq 47; 48; SSE2FAST-LABEL: memset_32_nonzero_bytes: 49; SSE2FAST: # %bb.0: 50; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 51; SSE2FAST-NEXT: movups %xmm0, 16(%rdi) 52; SSE2FAST-NEXT: movups %xmm0, (%rdi) 53; SSE2FAST-NEXT: retq 54; 55; AVX-LABEL: memset_32_nonzero_bytes: 56; AVX: # %bb.0: 57; AVX-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 58; AVX-NEXT: vmovups %ymm0, (%rdi) 59; AVX-NEXT: vzeroupper 60; AVX-NEXT: retq 61 %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 32, i64 -1) 62 ret void 63} 64 65define void @memset_64_nonzero_bytes(ptr %x) { 66; SSE-LABEL: memset_64_nonzero_bytes: 67; SSE: # %bb.0: 68; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A 69; SSE-NEXT: movq %rax, 56(%rdi) 70; SSE-NEXT: movq %rax, 48(%rdi) 71; SSE-NEXT: movq %rax, 40(%rdi) 72; SSE-NEXT: movq %rax, 32(%rdi) 73; SSE-NEXT: movq %rax, 24(%rdi) 74; SSE-NEXT: movq %rax, 16(%rdi) 75; SSE-NEXT: movq %rax, 8(%rdi) 76; SSE-NEXT: movq %rax, (%rdi) 77; SSE-NEXT: retq 78; 79; SSE2FAST-LABEL: memset_64_nonzero_bytes: 80; SSE2FAST: # %bb.0: 81; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 82; SSE2FAST-NEXT: movups %xmm0, 48(%rdi) 83; SSE2FAST-NEXT: movups %xmm0, 32(%rdi) 84; SSE2FAST-NEXT: movups %xmm0, 16(%rdi) 85; SSE2FAST-NEXT: movups 
%xmm0, (%rdi) 86; SSE2FAST-NEXT: retq 87; 88; AVX1-LABEL: memset_64_nonzero_bytes: 89; AVX1: # %bb.0: 90; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 91; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 92; AVX1-NEXT: vmovups %ymm0, (%rdi) 93; AVX1-NEXT: vzeroupper 94; AVX1-NEXT: retq 95; 96; AVX2-LABEL: memset_64_nonzero_bytes: 97; AVX2: # %bb.0: 98; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 99; AVX2-NEXT: vmovups %ymm0, 32(%rdi) 100; AVX2-NEXT: vmovups %ymm0, (%rdi) 101; AVX2-NEXT: vzeroupper 102; AVX2-NEXT: retq 103; 104; AVX512F-LABEL: memset_64_nonzero_bytes: 105; AVX512F: # %bb.0: 106; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] 107; AVX512F-NEXT: vmovups %zmm0, (%rdi) 108; AVX512F-NEXT: vzeroupper 109; AVX512F-NEXT: retq 110; 111; AVX512BW-LABEL: memset_64_nonzero_bytes: 112; AVX512BW: # %bb.0: 113; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 114; AVX512BW-NEXT: vmovups %zmm0, (%rdi) 115; AVX512BW-NEXT: vzeroupper 116; AVX512BW-NEXT: retq 118 %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 64, i64 -1) 119 ret void 120} 121 122define void @memset_128_nonzero_bytes(ptr %x) { 123; SSE-LABEL: memset_128_nonzero_bytes: 124; SSE: # %bb.0: 125; SSE-NEXT: movabsq $3038287259199220266, %rax # imm = 0x2A2A2A2A2A2A2A2A 126; SSE-NEXT: movq %rax, 120(%rdi) 127; SSE-NEXT: movq %rax, 112(%rdi) 128; SSE-NEXT: movq %rax, 104(%rdi) 129; SSE-NEXT: movq %rax, 96(%rdi) 130; SSE-NEXT: movq %rax, 88(%rdi) 131; SSE-NEXT: movq
%rax, 80(%rdi) 132; SSE-NEXT: movq %rax, 72(%rdi) 133; SSE-NEXT: movq %rax, 64(%rdi) 134; SSE-NEXT: movq %rax, 56(%rdi) 135; SSE-NEXT: movq %rax, 48(%rdi) 136; SSE-NEXT: movq %rax, 40(%rdi) 137; SSE-NEXT: movq %rax, 32(%rdi) 138; SSE-NEXT: movq %rax, 24(%rdi) 139; SSE-NEXT: movq %rax, 16(%rdi) 140; SSE-NEXT: movq %rax, 8(%rdi) 141; SSE-NEXT: movq %rax, (%rdi) 142; SSE-NEXT: retq 143; 144; SSE2FAST-LABEL: memset_128_nonzero_bytes: 145; SSE2FAST: # %bb.0: 146; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 147; SSE2FAST-NEXT: movups %xmm0, 112(%rdi) 148; SSE2FAST-NEXT: movups %xmm0, 96(%rdi) 149; SSE2FAST-NEXT: movups %xmm0, 80(%rdi) 150; SSE2FAST-NEXT: movups %xmm0, 64(%rdi) 151; SSE2FAST-NEXT: movups %xmm0, 48(%rdi) 152; SSE2FAST-NEXT: movups %xmm0, 32(%rdi) 153; SSE2FAST-NEXT: movups %xmm0, 16(%rdi) 154; SSE2FAST-NEXT: movups %xmm0, (%rdi) 155; SSE2FAST-NEXT: retq 156; 157; AVX1-LABEL: memset_128_nonzero_bytes: 158; AVX1: # %bb.0: 159; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 160; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 161; AVX1-NEXT: vmovups %ymm0, 64(%rdi) 162; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 163; AVX1-NEXT: vmovups %ymm0, (%rdi) 164; AVX1-NEXT: vzeroupper 165; AVX1-NEXT: retq 166; 167; AVX2-LABEL: memset_128_nonzero_bytes: 168; AVX2: # %bb.0: 169; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 170; AVX2-NEXT: vmovups %ymm0, 96(%rdi) 171; AVX2-NEXT: vmovups %ymm0, 64(%rdi) 172; AVX2-NEXT: vmovups %ymm0, 32(%rdi) 173; AVX2-NEXT: vmovups %ymm0, (%rdi) 174; AVX2-NEXT: vzeroupper 175; AVX2-NEXT: retq 176; 177; AVX512F-LABEL: memset_128_nonzero_bytes: 178; AVX512F: # %bb.0: 179; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = 
[707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] 180; AVX512F-NEXT: vmovups %zmm0, 64(%rdi) 181; AVX512F-NEXT: vmovups %zmm0, (%rdi) 182; AVX512F-NEXT: vzeroupper 183; AVX512F-NEXT: retq 184; 185; AVX512BW-LABEL: memset_128_nonzero_bytes: 186; AVX512BW: # %bb.0: 187; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 188; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) 189; AVX512BW-NEXT: vmovups %zmm0, (%rdi) 190; AVX512BW-NEXT: vzeroupper 191; AVX512BW-NEXT: retq 192 %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 128, i64 -1) 193 ret void 194} 195 196define void @memset_256_nonzero_bytes(ptr %x) { 197; SSE-LABEL: memset_256_nonzero_bytes: 198; SSE: # %bb.0: 199; SSE-NEXT: movl $256, %edx # imm = 0x100 200; SSE-NEXT: movl $42, %esi 201; SSE-NEXT: jmp memset@PLT # TAILCALL 202; 203; SSE2FAST-LABEL: memset_256_nonzero_bytes: 204; SSE2FAST: # %bb.0: 205; SSE2FAST-NEXT: movaps {{.*#+}} xmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 206; SSE2FAST-NEXT: movups %xmm0, 240(%rdi) 207; SSE2FAST-NEXT: movups %xmm0, 224(%rdi) 208; SSE2FAST-NEXT: movups %xmm0, 208(%rdi) 209; SSE2FAST-NEXT: movups %xmm0, 192(%rdi) 210; SSE2FAST-NEXT: movups %xmm0, 176(%rdi) 211; SSE2FAST-NEXT: movups %xmm0, 160(%rdi) 212; SSE2FAST-NEXT: movups %xmm0, 144(%rdi) 213; SSE2FAST-NEXT: movups %xmm0, 128(%rdi) 214; SSE2FAST-NEXT: movups %xmm0, 112(%rdi) 215; SSE2FAST-NEXT: movups %xmm0, 96(%rdi) 216; SSE2FAST-NEXT: movups %xmm0, 80(%rdi) 217; SSE2FAST-NEXT: movups %xmm0, 64(%rdi) 218; SSE2FAST-NEXT: movups %xmm0, 48(%rdi) 219; SSE2FAST-NEXT: movups %xmm0, 32(%rdi) 220; SSE2FAST-NEXT: movups %xmm0, 16(%rdi) 221; SSE2FAST-NEXT: movups %xmm0, (%rdi) 222; SSE2FAST-NEXT: retq 223; 224; AVX1-LABEL: 
memset_256_nonzero_bytes: 225; AVX1: # %bb.0: 226; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 227; AVX1-NEXT: vmovups %ymm0, 224(%rdi) 228; AVX1-NEXT: vmovups %ymm0, 192(%rdi) 229; AVX1-NEXT: vmovups %ymm0, 160(%rdi) 230; AVX1-NEXT: vmovups %ymm0, 128(%rdi) 231; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 232; AVX1-NEXT: vmovups %ymm0, 64(%rdi) 233; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 234; AVX1-NEXT: vmovups %ymm0, (%rdi) 235; AVX1-NEXT: vzeroupper 236; AVX1-NEXT: retq 237; 238; AVX2-LABEL: memset_256_nonzero_bytes: 239; AVX2: # %bb.0: 240; AVX2-NEXT: vbroadcastss {{.*#+}} ymm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 241; AVX2-NEXT: vmovups %ymm0, 224(%rdi) 242; AVX2-NEXT: vmovups %ymm0, 192(%rdi) 243; AVX2-NEXT: vmovups %ymm0, 160(%rdi) 244; AVX2-NEXT: vmovups %ymm0, 128(%rdi) 245; AVX2-NEXT: vmovups %ymm0, 96(%rdi) 246; AVX2-NEXT: vmovups %ymm0, 64(%rdi) 247; AVX2-NEXT: vmovups %ymm0, 32(%rdi) 248; AVX2-NEXT: vmovups %ymm0, (%rdi) 249; AVX2-NEXT: vzeroupper 250; AVX2-NEXT: retq 251; 252; AVX512F-LABEL: memset_256_nonzero_bytes: 253; AVX512F: # %bb.0: 254; AVX512F-NEXT: vbroadcastss {{.*#+}} zmm0 = [707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378,707406378] 255; AVX512F-NEXT: vmovups %zmm0, 192(%rdi) 256; AVX512F-NEXT: vmovups %zmm0, 128(%rdi) 257; AVX512F-NEXT: vmovups %zmm0, 64(%rdi) 258; AVX512F-NEXT: vmovups %zmm0, (%rdi) 259; AVX512F-NEXT: vzeroupper 260; AVX512F-NEXT: retq 261; 262; AVX512BW-LABEL: memset_256_nonzero_bytes: 263; AVX512BW: # %bb.0: 264; AVX512BW-NEXT: vbroadcastss {{.*#+}} zmm0 = [42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42,42] 265; AVX512BW-NEXT: vmovups 
%zmm0, 192(%rdi) 266; AVX512BW-NEXT: vmovups %zmm0, 128(%rdi) 267; AVX512BW-NEXT: vmovups %zmm0, 64(%rdi) 268; AVX512BW-NEXT: vmovups %zmm0, (%rdi) 269; AVX512BW-NEXT: vzeroupper 270; AVX512BW-NEXT: retq 271 %call = tail call ptr @__memset_chk(ptr %x, i32 42, i64 256, i64 -1) 272 ret void 273} 274 275declare ptr @__memset_chk(ptr, i32, i64, i64) 276 277; Repeat with a non-constant value for the stores. 278 279define void @memset_16_nonconst_bytes(ptr %x, i8 %c) { 280; SSE-LABEL: memset_16_nonconst_bytes: 281; SSE: # %bb.0: 282; SSE-NEXT: movzbl %sil, %eax 283; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 284; SSE-NEXT: imulq %rax, %rcx 285; SSE-NEXT: movq %rcx, 8(%rdi) 286; SSE-NEXT: movq %rcx, (%rdi) 287; SSE-NEXT: retq 288; 289; SSE2FAST-LABEL: memset_16_nonconst_bytes: 290; SSE2FAST: # %bb.0: 291; SSE2FAST-NEXT: movd %esi, %xmm0 292; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 293; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 294; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 295; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) 296; SSE2FAST-NEXT: retq 297; 298; AVX1-LABEL: memset_16_nonconst_bytes: 299; AVX1: # %bb.0: 300; AVX1-NEXT: vmovd %esi, %xmm0 301; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 302; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 303; AVX1-NEXT: vmovdqu %xmm0, (%rdi) 304; AVX1-NEXT: retq 305; 306; AVX2-LABEL: memset_16_nonconst_bytes: 307; AVX2: # %bb.0: 308; AVX2-NEXT: vmovd %esi, %xmm0 309; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 310; AVX2-NEXT: vmovdqu %xmm0, (%rdi) 311; AVX2-NEXT: retq 312; 313; AVX512-LABEL: memset_16_nonconst_bytes: 314; AVX512: # %bb.0: 315; AVX512-NEXT: vmovd %esi, %xmm0 316; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 317; AVX512-NEXT: vmovdqu %xmm0, (%rdi) 318; AVX512-NEXT: retq 319 tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 16, i1 false) 320 ret void 321} 322 323define void @memset_32_nonconst_bytes(ptr %x, i8 %c) { 324; SSE-LABEL: memset_32_nonconst_bytes: 
325; SSE: # %bb.0: 326; SSE-NEXT: movzbl %sil, %eax 327; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 328; SSE-NEXT: imulq %rax, %rcx 329; SSE-NEXT: movq %rcx, 24(%rdi) 330; SSE-NEXT: movq %rcx, 16(%rdi) 331; SSE-NEXT: movq %rcx, 8(%rdi) 332; SSE-NEXT: movq %rcx, (%rdi) 333; SSE-NEXT: retq 334; 335; SSE2FAST-LABEL: memset_32_nonconst_bytes: 336; SSE2FAST: # %bb.0: 337; SSE2FAST-NEXT: movd %esi, %xmm0 338; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 339; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 340; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 341; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) 342; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) 343; SSE2FAST-NEXT: retq 344; 345; AVX1-LABEL: memset_32_nonconst_bytes: 346; AVX1: # %bb.0: 347; AVX1-NEXT: vmovd %esi, %xmm0 348; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 349; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 350; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) 351; AVX1-NEXT: vmovdqu %xmm0, (%rdi) 352; AVX1-NEXT: retq 353; 354; AVX2-LABEL: memset_32_nonconst_bytes: 355; AVX2: # %bb.0: 356; AVX2-NEXT: vmovd %esi, %xmm0 357; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 358; AVX2-NEXT: vmovdqu %ymm0, (%rdi) 359; AVX2-NEXT: vzeroupper 360; AVX2-NEXT: retq 361; 362; AVX512-LABEL: memset_32_nonconst_bytes: 363; AVX512: # %bb.0: 364; AVX512-NEXT: vmovd %esi, %xmm0 365; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 366; AVX512-NEXT: vmovdqu %ymm0, (%rdi) 367; AVX512-NEXT: vzeroupper 368; AVX512-NEXT: retq 369 tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 32, i1 false) 370 ret void 371} 372 373define void @memset_64_nonconst_bytes(ptr %x, i8 %c) { 374; SSE-LABEL: memset_64_nonconst_bytes: 375; SSE: # %bb.0: 376; SSE-NEXT: movzbl %sil, %eax 377; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 378; SSE-NEXT: imulq %rax, %rcx 379; SSE-NEXT: movq %rcx, 56(%rdi) 380; SSE-NEXT: movq %rcx, 48(%rdi) 381; SSE-NEXT: movq %rcx, 40(%rdi) 382; SSE-NEXT: movq %rcx, 32(%rdi) 
383; SSE-NEXT: movq %rcx, 24(%rdi) 384; SSE-NEXT: movq %rcx, 16(%rdi) 385; SSE-NEXT: movq %rcx, 8(%rdi) 386; SSE-NEXT: movq %rcx, (%rdi) 387; SSE-NEXT: retq 388; 389; SSE2FAST-LABEL: memset_64_nonconst_bytes: 390; SSE2FAST: # %bb.0: 391; SSE2FAST-NEXT: movd %esi, %xmm0 392; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 393; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 394; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 395; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi) 396; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi) 397; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) 398; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) 399; SSE2FAST-NEXT: retq 400; 401; AVX1-LABEL: memset_64_nonconst_bytes: 402; AVX1: # %bb.0: 403; AVX1-NEXT: vmovd %esi, %xmm0 404; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 405; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 406; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 407; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 408; AVX1-NEXT: vmovups %ymm0, (%rdi) 409; AVX1-NEXT: vzeroupper 410; AVX1-NEXT: retq 411; 412; AVX2-LABEL: memset_64_nonconst_bytes: 413; AVX2: # %bb.0: 414; AVX2-NEXT: vmovd %esi, %xmm0 415; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 416; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 417; AVX2-NEXT: vmovdqu %ymm0, (%rdi) 418; AVX2-NEXT: vzeroupper 419; AVX2-NEXT: retq 420; 421; AVX512F-LABEL: memset_64_nonconst_bytes: 422; AVX512F: # %bb.0: 423; AVX512F-NEXT: movzbl %sil, %eax 424; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 425; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 426; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) 427; AVX512F-NEXT: vzeroupper 428; AVX512F-NEXT: retq 429; 430; AVX512BW-LABEL: memset_64_nonconst_bytes: 431; AVX512BW: # %bb.0: 432; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0 433; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) 434; AVX512BW-NEXT: vzeroupper 435; AVX512BW-NEXT: retq 436 tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 64, i1 false) 437 ret void 438} 439 440define void @memset_128_nonconst_bytes(ptr %x, i8 %c) { 
441; SSE-LABEL: memset_128_nonconst_bytes: 442; SSE: # %bb.0: 443; SSE-NEXT: movzbl %sil, %eax 444; SSE-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 445; SSE-NEXT: imulq %rax, %rcx 446; SSE-NEXT: movq %rcx, 120(%rdi) 447; SSE-NEXT: movq %rcx, 112(%rdi) 448; SSE-NEXT: movq %rcx, 104(%rdi) 449; SSE-NEXT: movq %rcx, 96(%rdi) 450; SSE-NEXT: movq %rcx, 88(%rdi) 451; SSE-NEXT: movq %rcx, 80(%rdi) 452; SSE-NEXT: movq %rcx, 72(%rdi) 453; SSE-NEXT: movq %rcx, 64(%rdi) 454; SSE-NEXT: movq %rcx, 56(%rdi) 455; SSE-NEXT: movq %rcx, 48(%rdi) 456; SSE-NEXT: movq %rcx, 40(%rdi) 457; SSE-NEXT: movq %rcx, 32(%rdi) 458; SSE-NEXT: movq %rcx, 24(%rdi) 459; SSE-NEXT: movq %rcx, 16(%rdi) 460; SSE-NEXT: movq %rcx, 8(%rdi) 461; SSE-NEXT: movq %rcx, (%rdi) 462; SSE-NEXT: retq 463; 464; SSE2FAST-LABEL: memset_128_nonconst_bytes: 465; SSE2FAST: # %bb.0: 466; SSE2FAST-NEXT: movd %esi, %xmm0 467; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 468; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 469; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 470; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi) 471; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi) 472; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi) 473; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi) 474; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi) 475; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi) 476; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) 477; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) 478; SSE2FAST-NEXT: retq 479; 480; AVX1-LABEL: memset_128_nonconst_bytes: 481; AVX1: # %bb.0: 482; AVX1-NEXT: vmovd %esi, %xmm0 483; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 484; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 485; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 486; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 487; AVX1-NEXT: vmovups %ymm0, 64(%rdi) 488; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 489; AVX1-NEXT: vmovups %ymm0, (%rdi) 490; AVX1-NEXT: vzeroupper 491; AVX1-NEXT: retq 492; 493; AVX2-LABEL: memset_128_nonconst_bytes: 494; AVX2: # %bb.0: 495; 
AVX2-NEXT: vmovd %esi, %xmm0 496; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 497; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) 498; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) 499; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 500; AVX2-NEXT: vmovdqu %ymm0, (%rdi) 501; AVX2-NEXT: vzeroupper 502; AVX2-NEXT: retq 503; 504; AVX512F-LABEL: memset_128_nonconst_bytes: 505; AVX512F: # %bb.0: 506; AVX512F-NEXT: movzbl %sil, %eax 507; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 508; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 509; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi) 510; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) 511; AVX512F-NEXT: vzeroupper 512; AVX512F-NEXT: retq 513; 514; AVX512BW-LABEL: memset_128_nonconst_bytes: 515; AVX512BW: # %bb.0: 516; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0 517; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) 518; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) 519; AVX512BW-NEXT: vzeroupper 520; AVX512BW-NEXT: retq 521 tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 128, i1 false) 522 ret void 523} 524 525define void @memset_256_nonconst_bytes(ptr %x, i8 %c) { 526; SSE-LABEL: memset_256_nonconst_bytes: 527; SSE: # %bb.0: 528; SSE-NEXT: movl $256, %edx # imm = 0x100 529; SSE-NEXT: jmp memset@PLT # TAILCALL 530; 531; SSE2FAST-LABEL: memset_256_nonconst_bytes: 532; SSE2FAST: # %bb.0: 533; SSE2FAST-NEXT: movd %esi, %xmm0 534; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] 535; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] 536; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] 537; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi) 538; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi) 539; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi) 540; SSE2FAST-NEXT: movdqu %xmm0, 192(%rdi) 541; SSE2FAST-NEXT: movdqu %xmm0, 176(%rdi) 542; SSE2FAST-NEXT: movdqu %xmm0, 160(%rdi) 543; SSE2FAST-NEXT: movdqu %xmm0, 144(%rdi) 544; SSE2FAST-NEXT: movdqu %xmm0, 128(%rdi) 545; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi) 546; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi) 547; SSE2FAST-NEXT: movdqu 
%xmm0, 80(%rdi) 548; SSE2FAST-NEXT: movdqu %xmm0, 64(%rdi) 549; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi) 550; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi) 551; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) 552; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) 553; SSE2FAST-NEXT: retq 554; 555; AVX1-LABEL: memset_256_nonconst_bytes: 556; AVX1: # %bb.0: 557; AVX1-NEXT: vmovd %esi, %xmm0 558; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 559; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 560; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 561; AVX1-NEXT: vmovups %ymm0, 224(%rdi) 562; AVX1-NEXT: vmovups %ymm0, 192(%rdi) 563; AVX1-NEXT: vmovups %ymm0, 160(%rdi) 564; AVX1-NEXT: vmovups %ymm0, 128(%rdi) 565; AVX1-NEXT: vmovups %ymm0, 96(%rdi) 566; AVX1-NEXT: vmovups %ymm0, 64(%rdi) 567; AVX1-NEXT: vmovups %ymm0, 32(%rdi) 568; AVX1-NEXT: vmovups %ymm0, (%rdi) 569; AVX1-NEXT: vzeroupper 570; AVX1-NEXT: retq 571; 572; AVX2-LABEL: memset_256_nonconst_bytes: 573; AVX2: # %bb.0: 574; AVX2-NEXT: vmovd %esi, %xmm0 575; AVX2-NEXT: vpbroadcastb %xmm0, %ymm0 576; AVX2-NEXT: vmovdqu %ymm0, 224(%rdi) 577; AVX2-NEXT: vmovdqu %ymm0, 192(%rdi) 578; AVX2-NEXT: vmovdqu %ymm0, 160(%rdi) 579; AVX2-NEXT: vmovdqu %ymm0, 128(%rdi) 580; AVX2-NEXT: vmovdqu %ymm0, 96(%rdi) 581; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) 582; AVX2-NEXT: vmovdqu %ymm0, 32(%rdi) 583; AVX2-NEXT: vmovdqu %ymm0, (%rdi) 584; AVX2-NEXT: vzeroupper 585; AVX2-NEXT: retq 586; 587; AVX512F-LABEL: memset_256_nonconst_bytes: 588; AVX512F: # %bb.0: 589; AVX512F-NEXT: movzbl %sil, %eax 590; AVX512F-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 591; AVX512F-NEXT: vpbroadcastd %eax, %zmm0 592; AVX512F-NEXT: vmovdqu64 %zmm0, 192(%rdi) 593; AVX512F-NEXT: vmovdqu64 %zmm0, 128(%rdi) 594; AVX512F-NEXT: vmovdqu64 %zmm0, 64(%rdi) 595; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi) 596; AVX512F-NEXT: vzeroupper 597; AVX512F-NEXT: retq 598; 599; AVX512BW-LABEL: memset_256_nonconst_bytes: 600; AVX512BW: # %bb.0: 601; AVX512BW-NEXT: vpbroadcastb %esi, %zmm0 602; AVX512BW-NEXT: vmovdqu64 %zmm0, 
192(%rdi) 603; AVX512BW-NEXT: vmovdqu64 %zmm0, 128(%rdi) 604; AVX512BW-NEXT: vmovdqu64 %zmm0, 64(%rdi) 605; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rdi) 606; AVX512BW-NEXT: vzeroupper 607; AVX512BW-NEXT: retq 608 tail call void @llvm.memset.p0.i64(ptr %x, i8 %c, i64 256, i1 false) 609 ret void 610} 611 612declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) #1 613 614