; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=WIDEN_SKX
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=WIDEN_KNL
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake < %s | FileCheck %s --check-prefix=WIDEN_AVX2

define <2 x double> @test_gather_v2i32_index(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm1
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpsllq $3, %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB0_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB0_3
; WIDEN_SKX-NEXT:  .LBB0_4: # %else2
; WIDEN_SKX-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB0_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB0_4
; WIDEN_SKX-NEXT:  .LBB0_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; WIDEN_SKX-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpsllq $3, %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm1
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB0_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB0_3
; WIDEN_KNL-NEXT:  .LBB0_4: # %else2
; WIDEN_KNL-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB0_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB0_4
; WIDEN_KNL-NEXT:  .LBB0_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; WIDEN_KNL-NEXT:    vmovaps %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,8), %xmm2
; WIDEN_AVX2-NEXT:    vmovapd %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %gep.random = getelementptr double, ptr %base, <2 x i32> %ind
  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double> %res
}

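; Scatter two f64 elements at base + sign-extended i32 indices. No
; configuration emits a hardware scatter here; all three lower to mask-test
; branches around scalar stores.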
define void @test_scatter_v2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm2
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpsllq $3, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB1_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB1_3
; WIDEN_SKX-NEXT:  .LBB1_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB1_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB1_4
; WIDEN_SKX-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpsllq $3, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm2
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB1_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB1_3
; WIDEN_KNL-NEXT:  .LBB1_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB1_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB1_4
; WIDEN_KNL-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $3, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB1_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB1_3
; WIDEN_AVX2-NEXT:  .LBB1_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB1_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovlps %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB1_4
; WIDEN_AVX2-NEXT:  .LBB1_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovhps %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  %gep = getelementptr double, ptr %base, <2 x i32> %ind
  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
  ret void
}

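; Gather two i32 elements through explicit pointers. The AVX-512
; configurations scalarize into conditional loads; AVX2 can use vpgatherqd
; directly.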
define <2 x i32> @test_gather_v2i32_data(<2 x ptr> %ptr, <2 x i1> %mask, <2 x i32> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_data:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB2_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB2_3
; WIDEN_SKX-NEXT:  .LBB2_4: # %else2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB2_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB2_4
; WIDEN_SKX-NEXT:  .LBB2_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB2_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB2_3
; WIDEN_KNL-NEXT:  .LBB2_4: # %else2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB2_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB2_4
; WIDEN_KNL-NEXT:  .LBB2_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; WIDEN_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpgatherqd %xmm1, (,%xmm0), %xmm2
; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %ptr, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32> %res
}

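; Scatter two i32 elements through explicit pointers; scalarized into
; conditional stores on every configuration.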
define void @test_scatter_v2i32_data(<2 x i32> %a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_data:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB3_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB3_3
; WIDEN_SKX-NEXT:  .LBB3_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB3_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB3_4
; WIDEN_SKX-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB3_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB3_3
; WIDEN_KNL-NEXT:  .LBB3_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB3_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB3_4
; WIDEN_KNL-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB3_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB3_3
; WIDEN_AVX2-NEXT:  .LBB3_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB3_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB3_4
; WIDEN_AVX2-NEXT:  .LBB3_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask)
  ret void
}

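; Gather two i32 elements at base + sign-extended i32 indices. AVX2 selects
; vpgatherdd; the AVX-512 configurations scalarize.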
define <2 x i32> @test_gather_v2i32_data_index(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
; WIDEN_SKX-LABEL: test_gather_v2i32_data_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpmovq2m %xmm1, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm1
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB4_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB4_3
; WIDEN_SKX-NEXT:  .LBB4_4: # %else2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB4_1: # %cond.load
; WIDEN_SKX-NEXT:    vmovq %xmm0, %rcx
; WIDEN_SKX-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB4_4
; WIDEN_SKX-NEXT:  .LBB4_3: # %cond.load1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_SKX-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_gather_v2i32_data_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vptestmq %zmm1, %zmm1, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm1
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB4_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB4_3
; WIDEN_KNL-NEXT:  .LBB4_4: # %else2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB4_1: # %cond.load
; WIDEN_KNL-NEXT:    vmovq %xmm0, %rcx
; WIDEN_KNL-NEXT:    vpinsrd $0, (%rcx), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB4_4
; WIDEN_KNL-NEXT:  .LBB4_3: # %cond.load1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_KNL-NEXT:    vpinsrd $1, (%rax), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_gather_v2i32_data_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
; WIDEN_AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpgatherdd %xmm1, (%rdi,%xmm0,4), %xmm2
; WIDEN_AVX2-NEXT:    vmovdqa %xmm2, %xmm0
; WIDEN_AVX2-NEXT:    retq
  %gep.random = getelementptr i32, ptr %base, <2 x i32> %ind
  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32> %res
}

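; Scatter two i32 elements at base + sign-extended i32 indices; scalarized on
; every configuration.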
define void @test_scatter_v2i32_data_index(<2 x i32> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) {
; WIDEN_SKX-LABEL: test_scatter_v2i32_data_index:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpmovq2m %xmm2, %k0
; WIDEN_SKX-NEXT:    vpbroadcastq %rdi, %xmm2
; WIDEN_SKX-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_SKX-NEXT:    kmovw %k0, %eax
; WIDEN_SKX-NEXT:    testb $1, %al
; WIDEN_SKX-NEXT:    jne .LBB5_1
; WIDEN_SKX-NEXT:  # %bb.2: # %else
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    jne .LBB5_3
; WIDEN_SKX-NEXT:  .LBB5_4: # %else2
; WIDEN_SKX-NEXT:    retq
; WIDEN_SKX-NEXT:  .LBB5_1: # %cond.store
; WIDEN_SKX-NEXT:    vmovq %xmm1, %rcx
; WIDEN_SKX-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_SKX-NEXT:    testb $2, %al
; WIDEN_SKX-NEXT:    je .LBB5_4
; WIDEN_SKX-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_SKX-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_SKX-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_scatter_v2i32_data_index:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vptestmq %zmm2, %zmm2, %k0
; WIDEN_KNL-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovq %rdi, %xmm2
; WIDEN_KNL-NEXT:    vpbroadcastq %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpaddq %xmm1, %xmm2, %xmm1
; WIDEN_KNL-NEXT:    kmovw %k0, %eax
; WIDEN_KNL-NEXT:    testb $1, %al
; WIDEN_KNL-NEXT:    jne .LBB5_1
; WIDEN_KNL-NEXT:  # %bb.2: # %else
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    jne .LBB5_3
; WIDEN_KNL-NEXT:  .LBB5_4: # %else2
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
; WIDEN_KNL-NEXT:  .LBB5_1: # %cond.store
; WIDEN_KNL-NEXT:    vmovq %xmm1, %rcx
; WIDEN_KNL-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_KNL-NEXT:    testb $2, %al
; WIDEN_KNL-NEXT:    je .LBB5_4
; WIDEN_KNL-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_KNL-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_KNL-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_scatter_v2i32_data_index:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm3
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpaddq %xmm1, %xmm3, %xmm1
; WIDEN_AVX2-NEXT:    vpsllq $63, %xmm2, %xmm2
; WIDEN_AVX2-NEXT:    vmovmskpd %xmm2, %eax
; WIDEN_AVX2-NEXT:    testb $1, %al
; WIDEN_AVX2-NEXT:    jne .LBB5_1
; WIDEN_AVX2-NEXT:  # %bb.2: # %else
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    jne .LBB5_3
; WIDEN_AVX2-NEXT:  .LBB5_4: # %else2
; WIDEN_AVX2-NEXT:    retq
; WIDEN_AVX2-NEXT:  .LBB5_1: # %cond.store
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rcx
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rcx)
; WIDEN_AVX2-NEXT:    testb $2, %al
; WIDEN_AVX2-NEXT:    je .LBB5_4
; WIDEN_AVX2-NEXT:  .LBB5_3: # %cond.store1
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vextractps $1, %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    retq
  %gep = getelementptr i32, ptr %base, <2 x i32> %ind
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
  ret void
}

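; Scatter of the non-power-of-two type <17 x float> under an all-true mask.
; The AVX-512 configurations split it into a full v16f32 vscatterdps plus a
; single-lane vscatterdps; AVX2 scalarizes all 17 stores.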
define void @test_mscatter_v17f32(ptr %base, <17 x i32> %index, <17 x float> %val)
; WIDEN_SKX-LABEL: test_mscatter_v17f32:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; WIDEN_SKX-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; WIDEN_SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; WIDEN_SKX-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_SKX-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_SKX-NEXT:    vmovd %esi, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $3, %r8d, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vmovd %r9d, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; WIDEN_SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_SKX-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; WIDEN_SKX-NEXT:    movw $1, %ax
; WIDEN_SKX-NEXT:    kmovw %eax, %k1
; WIDEN_SKX-NEXT:    vscatterdps %zmm2, (%rdi,%zmm3,4) {%k1}
; WIDEN_SKX-NEXT:    vzeroupper
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_mscatter_v17f32:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; WIDEN_KNL-NEXT:    vinsertf128 $1, %xmm4, %ymm0, %ymm0
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
; WIDEN_KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
; WIDEN_KNL-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_KNL-NEXT:    vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_KNL-NEXT:    vmovd %esi, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $1, %edx, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $2, %ecx, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $3, %r8d, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vmovd %r9d, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
; WIDEN_KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_KNL-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; WIDEN_KNL-NEXT:    movw $1, %ax
; WIDEN_KNL-NEXT:    kmovw %eax, %k1
; WIDEN_KNL-NEXT:    vscatterdps %zmm2, (%rdi,%zmm3,4) {%k1}
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_mscatter_v17f32:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vmovq %rdi, %xmm8
; WIDEN_AVX2-NEXT:    vpbroadcastq %xmm8, %ymm8
; WIDEN_AVX2-NEXT:    vmovd %esi, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $1, %edx, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $2, %ecx, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpinsrd $3, %r8d, %xmm9, %xmm9
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm9, %ymm9
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm9, %ymm9
; WIDEN_AVX2-NEXT:    vpaddq %ymm9, %ymm8, %ymm9
; WIDEN_AVX2-NEXT:    vmovq %xmm9, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm9, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm1, (%rax)
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm9, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm2, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm3, (%rax)
; WIDEN_AVX2-NEXT:    vmovd %r9d, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm0, %ymm0
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm0, %ymm0
; WIDEN_AVX2-NEXT:    vpaddq %ymm0, %ymm8, %ymm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm4, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm5, (%rax)
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm6, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm7, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm1, %ymm1
; WIDEN_AVX2-NEXT:    vpaddq %ymm1, %ymm8, %ymm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm1, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm1, %ymm1
; WIDEN_AVX2-NEXT:    vpsllq $2, %ymm1, %ymm1
; WIDEN_AVX2-NEXT:    vpaddq %ymm1, %ymm8, %ymm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovq %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vpextrq $1, %xmm1, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpmovsxdq %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpsllq $2, %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpaddq %xmm0, %xmm8, %xmm0
; WIDEN_AVX2-NEXT:    vmovq %xmm0, %rax
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vmovss %xmm0, (%rax)
; WIDEN_AVX2-NEXT:    vzeroupper
; WIDEN_AVX2-NEXT:    retq
{
  %gep = getelementptr float, ptr %base, <17 x i32> %index
  call void @llvm.masked.scatter.v17f32.v17p0(<17 x float> %val, <17 x ptr> %gep, i32 4, <17 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret void
}

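; Matching <17 x float> gather under an all-true mask: AVX-512 uses a v16f32
; vgatherdps plus a single-lane gather, while AVX2 uses two v8f32 vgatherdps
; ops plus a single-lane gather.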
define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index)
; WIDEN_SKX-LABEL: test_mgather_v17f32:
; WIDEN_SKX:       # %bb.0:
; WIDEN_SKX-NEXT:    movq %rdi, %rax
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; WIDEN_SKX-NEXT:    vmovd %edx, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
; WIDEN_SKX-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_SKX-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; WIDEN_SKX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_SKX-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_SKX-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; WIDEN_SKX-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; WIDEN_SKX-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
; WIDEN_SKX-NEXT:    movw $1, %cx
; WIDEN_SKX-NEXT:    kmovw %ecx, %k1
; WIDEN_SKX-NEXT:    vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
; WIDEN_SKX-NEXT:    vmovss %xmm2, 64(%rdi)
; WIDEN_SKX-NEXT:    vmovaps %zmm3, (%rdi)
; WIDEN_SKX-NEXT:    vzeroupper
; WIDEN_SKX-NEXT:    retq
;
; WIDEN_KNL-LABEL: test_mgather_v17f32:
; WIDEN_KNL:       # %bb.0:
; WIDEN_KNL-NEXT:    movq %rdi, %rax
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; WIDEN_KNL-NEXT:    vmovd %edx, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $2, %r8d, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vpinsrd $3, %r9d, %xmm1, %xmm1
; WIDEN_KNL-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
; WIDEN_KNL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; WIDEN_KNL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_KNL-NEXT:    kxnorw %k0, %k0, %k1
; WIDEN_KNL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; WIDEN_KNL-NEXT:    vxorps %xmm3, %xmm3, %xmm3
; WIDEN_KNL-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm3 {%k1}
; WIDEN_KNL-NEXT:    movw $1, %cx
; WIDEN_KNL-NEXT:    kmovw %ecx, %k1
; WIDEN_KNL-NEXT:    vgatherdps (%rsi,%zmm1,4), %zmm2 {%k1}
; WIDEN_KNL-NEXT:    vmovss %xmm2, 64(%rdi)
; WIDEN_KNL-NEXT:    vmovaps %zmm3, (%rdi)
; WIDEN_KNL-NEXT:    vzeroupper
; WIDEN_KNL-NEXT:    retq
;
; WIDEN_AVX2-LABEL: test_mgather_v17f32:
; WIDEN_AVX2:       # %bb.0:
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    movq %rdi, %rax
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm0, %xmm0
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vmovd %edx, %xmm3
; WIDEN_AVX2-NEXT:    vpinsrd $1, %ecx, %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpinsrd $2, %r8d, %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vpinsrd $3, %r9d, %xmm3, %xmm3
; WIDEN_AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
; WIDEN_AVX2-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; WIDEN_AVX2-NEXT:    vpinsrd $1, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $2, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vpinsrd $3, {{[0-9]+}}(%rsp), %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
; WIDEN_AVX2-NEXT:    vpcmpeqd %ymm3, %ymm3, %ymm3
; WIDEN_AVX2-NEXT:    vxorps %xmm4, %xmm4, %xmm4
; WIDEN_AVX2-NEXT:    vpcmpeqd %ymm5, %ymm5, %ymm5
; WIDEN_AVX2-NEXT:    vxorps %xmm6, %xmm6, %xmm6
; WIDEN_AVX2-NEXT:    vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6
; WIDEN_AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; WIDEN_AVX2-NEXT:    vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1
; WIDEN_AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [4294967295,0,0,0]
; WIDEN_AVX2-NEXT:    vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4
; WIDEN_AVX2-NEXT:    vmovss %xmm4, 64(%rdi)
; WIDEN_AVX2-NEXT:    vmovaps %ymm1, 32(%rdi)
; WIDEN_AVX2-NEXT:    vmovaps %ymm6, (%rdi)
; WIDEN_AVX2-NEXT:    vzeroupper
; WIDEN_AVX2-NEXT:    retq
{
  %gep = getelementptr float, ptr %base, <17 x i32> %index
  %res = call <17 x float> @llvm.masked.gather.v17f32.v17p0(<17 x ptr> %gep, i32 4, <17 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <17 x float> undef)
  ret <17 x float> %res
}

declare <17 x float> @llvm.masked.gather.v17f32.v17p0(<17 x ptr>, i32 immarg, <17 x i1>, <17 x float>)
declare void @llvm.masked.scatter.v17f32.v17p0(<17 x float>, <17 x ptr>, i32, <17 x i1>)

declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>)
declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>)
declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>)
declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32>, <2 x ptr>, i32, <2 x i1>)