; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null

@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16

; SCALAR-LABEL: test1
; SCALAR: extractelement <16 x ptr>
; SCALAR-NEXT: load float
; SCALAR-NEXT: insertelement <16 x float>
; SCALAR-NEXT: extractelement <16 x ptr>
; SCALAR-NEXT: load float

define <16 x float> @test1(ptr %base, <16 x i32> %ind) {
; KNL_64-LABEL: test1:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vmovaps %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test1:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kxnorw %k0,
%k0, %k1 33; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 34; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 35; KNL_32-NEXT: vmovaps %zmm1, %zmm0 36; KNL_32-NEXT: retl 37; 38; SKX-LABEL: test1: 39; SKX: # %bb.0: 40; SKX-NEXT: kxnorw %k0, %k0, %k1 41; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 42; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 43; SKX-NEXT: vmovaps %zmm1, %zmm0 44; SKX-NEXT: retq 45; 46; SKX_32-LABEL: test1: 47; SKX_32: # %bb.0: 48; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 49; SKX_32-NEXT: kxnorw %k0, %k0, %k1 50; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 51; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 52; SKX_32-NEXT: vmovaps %zmm1, %zmm0 53; SKX_32-NEXT: retl 54 55 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 56 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 57 58 %sext_ind = sext <16 x i32> %ind to <16 x i64> 59 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind 60 61 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 62 ret <16 x float>%res 63} 64 65declare <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x i32>) 66declare <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x float>) 67declare <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> , i32, <8 x i1> , <8 x i32> ) 68 69 70; SCALAR-LABEL: test2 71; SCALAR: extractelement <16 x ptr> 72; SCALAR-NEXT: load float 73; SCALAR-NEXT: insertelement <16 x float> 74; SCALAR-NEXT: br label %else 75; SCALAR: else: 76; SCALAR-NEXT: %res.phi.else = phi 77; SCALAR-NEXT: and i16 %{{.*}}, 2 78; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0 79; SCALAR-NEXT: br i1 %{{.*}}, label %cond.load1, label %else2 80 81define <16 x 
float> @test2(ptr %base, <16 x i32> %ind, i16 %mask) { 82; KNL_64-LABEL: test2: 83; KNL_64: # %bb.0: 84; KNL_64-NEXT: kmovw %esi, %k1 85; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 86; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 87; KNL_64-NEXT: vmovaps %zmm1, %zmm0 88; KNL_64-NEXT: retq 89; 90; KNL_32-LABEL: test2: 91; KNL_32: # %bb.0: 92; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 93; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 94; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 95; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 96; KNL_32-NEXT: vmovaps %zmm1, %zmm0 97; KNL_32-NEXT: retl 98; 99; SKX-LABEL: test2: 100; SKX: # %bb.0: 101; SKX-NEXT: kmovw %esi, %k1 102; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 103; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 104; SKX-NEXT: vmovaps %zmm1, %zmm0 105; SKX-NEXT: retq 106; 107; SKX_32-LABEL: test2: 108; SKX_32: # %bb.0: 109; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 110; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 111; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 112; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 113; SKX_32-NEXT: vmovaps %zmm1, %zmm0 114; SKX_32-NEXT: retl 115 116 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 117 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 118 119 %sext_ind = sext <16 x i32> %ind to <16 x i64> 120 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind 121 %imask = bitcast i16 %mask to <16 x i1> 122 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef) 123 ret <16 x float> %res 124} 125 126define <16 x i32> @test3(ptr %base, <16 x i32> %ind, i16 %mask) { 127; KNL_64-LABEL: test3: 128; KNL_64: # %bb.0: 129; KNL_64-NEXT: kmovw %esi, %k1 130; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 131; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 132; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm0 133; KNL_64-NEXT: retq 
134; 135; KNL_32-LABEL: test3: 136; KNL_32: # %bb.0: 137; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 138; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 139; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 140; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} 141; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 142; KNL_32-NEXT: retl 143; 144; SKX-LABEL: test3: 145; SKX: # %bb.0: 146; SKX-NEXT: kmovw %esi, %k1 147; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 148; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1} 149; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 150; SKX-NEXT: retq 151; 152; SKX_32-LABEL: test3: 153; SKX_32: # %bb.0: 154; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 155; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 156; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 157; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k1} 158; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0 159; SKX_32-NEXT: retl 160 161 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 162 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 163 164 %sext_ind = sext <16 x i32> %ind to <16 x i64> 165 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind 166 %imask = bitcast i16 %mask to <16 x i1> 167 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 168 ret <16 x i32> %res 169} 170 171 172define <16 x i32> @test4(ptr %base, <16 x i32> %ind, i16 %mask) { 173; KNL_64-LABEL: test4: 174; KNL_64: # %bb.0: 175; KNL_64-NEXT: kmovw %esi, %k1 176; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 177; KNL_64-NEXT: kmovw %k1, %k2 178; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 179; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2 180; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 181; KNL_64-NEXT: vpaddd %zmm2, %zmm1, %zmm0 182; KNL_64-NEXT: retq 183; 184; KNL_32-LABEL: test4: 185; KNL_32: # %bb.0: 186; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 187; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 188; 
KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 189; KNL_32-NEXT: kmovw %k1, %k2 190; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} 191; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 192; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} 193; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 194; KNL_32-NEXT: retl 195; 196; SKX-LABEL: test4: 197; SKX: # %bb.0: 198; SKX-NEXT: kmovw %esi, %k1 199; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 200; SKX-NEXT: kmovw %k1, %k2 201; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 202; SKX-NEXT: vmovdqa64 %zmm1, %zmm2 203; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 204; SKX-NEXT: vpaddd %zmm2, %zmm1, %zmm0 205; SKX-NEXT: retq 206; 207; SKX_32-LABEL: test4: 208; SKX_32: # %bb.0: 209; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 210; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 211; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 212; SKX_32-NEXT: kmovw %k1, %k2 213; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} 214; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2 215; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} 216; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 217; SKX_32-NEXT: retl 218 219 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 220 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 221 222 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind 223 %imask = bitcast i16 %mask to <16 x i1> 224 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 225 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 226 %res = add <16 x i32> %gt1, %gt2 227 ret <16 x i32> %res 228} 229 230 231; SCALAR-LABEL: test5 232; SCALAR: and i16 %scalar_mask, 1 233; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0 234; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store, label %else 235; SCALAR: cond.store: 236; SCALAR-NEXT: %Elt0 = extractelement <16 x i32> 
%val, i64 0 237; SCALAR-NEXT: %Ptr0 = extractelement <16 x ptr> %gep.random, i64 0 238; SCALAR-NEXT: store i32 %Elt0, ptr %Ptr0, align 4 239; SCALAR-NEXT: br label %else 240; SCALAR: else: 241; SCALAR-NEXT: and i16 %scalar_mask, 2 242; SCALAR-NEXT: icmp ne i16 %{{.*}}, 0 243; SCALAR-NEXT: br i1 %{{.*}}, label %cond.store1, label %else2 244 245define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) { 246; KNL_64-LABEL: test5: 247; KNL_64: # %bb.0: 248; KNL_64-NEXT: kmovw %esi, %k1 249; KNL_64-NEXT: kmovw %k1, %k2 250; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 251; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 252; KNL_64-NEXT: vzeroupper 253; KNL_64-NEXT: retq 254; 255; KNL_32-LABEL: test5: 256; KNL_32: # %bb.0: 257; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 258; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 259; KNL_32-NEXT: kmovw %k1, %k2 260; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} 261; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} 262; KNL_32-NEXT: vzeroupper 263; KNL_32-NEXT: retl 264; 265; SKX-LABEL: test5: 266; SKX: # %bb.0: 267; SKX-NEXT: kmovw %esi, %k1 268; SKX-NEXT: kmovw %k1, %k2 269; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2} 270; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1} 271; SKX-NEXT: vzeroupper 272; SKX-NEXT: retq 273; 274; SKX_32-LABEL: test5: 275; SKX_32: # %bb.0: 276; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 277; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 278; SKX_32-NEXT: kmovw %k1, %k2 279; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2} 280; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1} 281; SKX_32-NEXT: vzeroupper 282; SKX_32-NEXT: retl 283 284 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 285 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 286 287 %gep.random = getelementptr i32, <16 x ptr> %broadcast.splat, <16 x i32> %ind 288 %imask = bitcast i16 %mask to <16 x i1> 289 call 
void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) 290 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32>%val, <16 x ptr> %gep.random, i32 4, <16 x i1> %imask) 291 ret void 292} 293 294declare void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> , <8 x ptr> , i32 , <8 x i1> ) 295declare void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> , <16 x ptr> , i32 , <16 x i1> ) 296 297 298; SCALAR-LABEL: test6 299; SCALAR: store i32 %Elt0, ptr %Ptr01, align 4 300; SCALAR-NEXT: %Elt1 = extractelement <8 x i32> %a1, i64 1 301; SCALAR-NEXT: %Ptr12 = extractelement <8 x ptr> %ptr, i64 1 302; SCALAR-NEXT: store i32 %Elt1, ptr %Ptr12, align 4 303; SCALAR-NEXT: %Elt2 = extractelement <8 x i32> %a1, i64 2 304; SCALAR-NEXT: %Ptr23 = extractelement <8 x ptr> %ptr, i64 2 305; SCALAR-NEXT: store i32 %Elt2, ptr %Ptr23, align 4 306 307define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) { 308; KNL_64-LABEL: test6: 309; KNL_64: # %bb.0: 310; KNL_64-NEXT: kxnorw %k0, %k0, %k1 311; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2 312; KNL_64-NEXT: kxnorw %k0, %k0, %k2 313; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 314; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1} 315; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 316; KNL_64-NEXT: retq 317; 318; KNL_32-LABEL: test6: 319; KNL_32: # %bb.0: 320; KNL_32-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1 321; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 322; KNL_32-NEXT: vpxor %xmm2, %xmm2, %xmm2 323; KNL_32-NEXT: movw $255, %ax 324; KNL_32-NEXT: kmovw %eax, %k1 325; KNL_32-NEXT: kmovw %k1, %k2 326; KNL_32-NEXT: vpgatherdd (,%zmm1), %zmm2 {%k2} 327; KNL_32-NEXT: vpscatterdd %zmm0, (,%zmm1) {%k1} 328; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 329; KNL_32-NEXT: retl 330; 331; SKX-LABEL: test6: 332; SKX: # %bb.0: 333; SKX-NEXT: kxnorw %k0, %k0, %k1 334; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 335; SKX-NEXT: kxnorw %k0, %k0, %k2 336; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 337; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) 
{%k1} 338; SKX-NEXT: vmovdqa %ymm2, %ymm0 339; SKX-NEXT: retq 340; 341; SKX_32-LABEL: test6: 342; SKX_32: # %bb.0: 343; SKX_32-NEXT: kxnorw %k0, %k0, %k1 344; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2 345; SKX_32-NEXT: kxnorw %k0, %k0, %k2 346; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2} 347; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1} 348; SKX_32-NEXT: vmovdqa %ymm2, %ymm0 349; SKX_32-NEXT: retl 350 351 %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 352 353 call void @llvm.masked.scatter.v8i32.v8p0(<8 x i32> %a1, <8 x ptr> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) 354 ret <8 x i32>%a 355} 356 357define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) { 358; 359; KNL_64-LABEL: test7: 360; KNL_64: # %bb.0: 361; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 362; KNL_64-NEXT: kmovw %esi, %k0 363; KNL_64-NEXT: kshiftlw $8, %k0, %k0 364; KNL_64-NEXT: kshiftrw $8, %k0, %k1 365; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 366; KNL_64-NEXT: kmovw %k1, %k2 367; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2} 368; KNL_64-NEXT: vmovdqa64 %zmm1, %zmm2 369; KNL_64-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1} 370; KNL_64-NEXT: vpaddd %ymm2, %ymm1, %ymm0 371; KNL_64-NEXT: retq 372; 373; KNL_32-LABEL: test7: 374; KNL_32: # %bb.0: 375; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 376; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 377; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %ecx 378; KNL_32-NEXT: kmovw %ecx, %k0 379; KNL_32-NEXT: kshiftlw $8, %k0, %k0 380; KNL_32-NEXT: kshiftrw $8, %k0, %k1 381; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 382; KNL_32-NEXT: kmovw %k1, %k2 383; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2} 384; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 385; KNL_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1} 386; KNL_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 387; KNL_32-NEXT: retl 
388; 389; SKX-LABEL: test7: 390; SKX: # %bb.0: 391; SKX-NEXT: kmovw %esi, %k1 392; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 393; SKX-NEXT: kmovw %k1, %k2 394; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2} 395; SKX-NEXT: vmovdqa %ymm1, %ymm2 396; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1} 397; SKX-NEXT: vpaddd %ymm2, %ymm1, %ymm0 398; SKX-NEXT: retq 399; 400; SKX_32-LABEL: test7: 401; SKX_32: # %bb.0: 402; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 403; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1 404; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 405; SKX_32-NEXT: kmovw %k1, %k2 406; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2} 407; SKX_32-NEXT: vmovdqa %ymm1, %ymm2 408; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1} 409; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm0 410; SKX_32-NEXT: retl 411 412 %broadcast.splatinsert = insertelement <8 x ptr> undef, ptr %base, i32 0 413 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer 414 415 %gep.random = getelementptr i32, <8 x ptr> %broadcast.splat, <8 x i32> %ind 416 %imask = bitcast i8 %mask to <8 x i1> 417 %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef) 418 %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1) 419 %res = add <8 x i32> %gt1, %gt2 420 ret <8 x i32> %res 421} 422 423; No uniform base in this case, index <8 x i64> contains addresses, 424; each gather call will be split into two 425define <16 x i32> @test8(<16 x ptr> %ptr.random, <16 x i32> %ind, i16 %mask) { 426; KNL_64-LABEL: test8: 427; KNL_64: # %bb.0: 428; KNL_64-NEXT: kmovw %edi, %k1 429; KNL_64-NEXT: kshiftrw $8, %k1, %k2 430; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2 431; KNL_64-NEXT: kmovw %k2, %k3 432; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3 433; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3} 434; KNL_64-NEXT: kmovw %k1, %k3 435; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 
{%k3} 436; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4 437; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2} 438; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} 439; KNL_64-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 440; KNL_64-NEXT: vpaddd %zmm0, %zmm4, %zmm0 441; KNL_64-NEXT: retq 442; 443; KNL_32-LABEL: test8: 444; KNL_32: # %bb.0: 445; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 446; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 447; KNL_32-NEXT: kmovw %k1, %k2 448; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 449; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm2 450; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 451; KNL_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 452; KNL_32-NEXT: retl 453; 454; SKX-LABEL: test8: 455; SKX: # %bb.0: 456; SKX-NEXT: kmovw %edi, %k1 457; SKX-NEXT: kshiftrw $8, %k1, %k2 458; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 459; SKX-NEXT: kmovw %k2, %k3 460; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 461; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3} 462; SKX-NEXT: kmovw %k1, %k3 463; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3} 464; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4 465; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2} 466; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} 467; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0 468; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0 469; SKX-NEXT: retq 470; 471; SKX_32-LABEL: test8: 472; SKX_32: # %bb.0: 473; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 474; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 475; SKX_32-NEXT: kmovw %k1, %k2 476; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2} 477; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2 478; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 479; SKX_32-NEXT: vpaddd %zmm2, %zmm1, %zmm0 480; SKX_32-NEXT: retl 481 482 %imask = bitcast i16 %mask to <16 x i1> 483 %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef) 484 %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1) 485 %res = add <16 x i32> 
%gt1, %gt2 486 ret <16 x i32> %res 487} 488 489%struct.RT = type { i8, [10 x [20 x i32]], i8 } 490%struct.ST = type { i32, double, %struct.RT } 491 492; Masked gather for aggregate types 493; Test9 and Test10 should give the same result (scalar and vector indices in GEP) 494 495 496define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) { 497; KNL_64-LABEL: test9: 498; KNL_64: # %bb.0: # %entry 499; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 500; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824] 501; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 502; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2 503; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 504; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 505; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 506; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 507; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 508; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 509; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 510; KNL_64-NEXT: kxnorw %k0, %k0, %k1 511; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 512; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 513; KNL_64-NEXT: retq 514; 515; KNL_32-LABEL: test9: 516; KNL_32: # %bb.0: # %entry 517; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 518; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80] 519; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 520; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 521; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [820,820,820,820,820,820,820,820] 522; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 523; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 524; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 525; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 526; KNL_32-NEXT: movw $255, %ax 527; KNL_32-NEXT: kmovw %eax, %k1 528; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1} 529; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 530; KNL_32-NEXT: retl 531; 532; 
SKX_SMALL-LABEL: test9: 533; SKX_SMALL: # %bb.0: # %entry 534; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 535; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 536; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 537; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 538; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 539; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 540; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 541; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 542; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 543; SKX_SMALL-NEXT: retq 544; 545; SKX_LARGE-LABEL: test9: 546; SKX_LARGE: # %bb.0: # %entry 547; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2 548; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 549; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax 550; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 551; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax 552; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 553; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 554; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 555; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 556; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 557; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 558; SKX_LARGE-NEXT: retq 559; 560; SKX_32-LABEL: test9: 561; SKX_32: # %bb.0: # %entry 562; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1 563; SKX_32-NEXT: vpmovqd %zmm0, %ymm0 564; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 565; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 566; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 567; SKX_32-NEXT: kxnorw %k0, %k0, %k1 568; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 569; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} 570; SKX_32-NEXT: retl 571entry: 572 %broadcast.splatinsert = 
insertelement <8 x ptr> undef, ptr %base, i32 0 573 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer 574 575 %arrayidx = getelementptr %struct.ST, <8 x ptr> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13> 576 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0(<8 x ptr>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 577 ret <8 x i32> %res 578} 579 580define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) { 581; KNL_64-LABEL: test10: 582; KNL_64: # %bb.0: # %entry 583; KNL_64-NEXT: vpbroadcastq %rdi, %zmm2 584; KNL_64-NEXT: vpbroadcastq {{.*#+}} zmm3 = [824,824,824,824,824,824,824,824] 585; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm4 586; KNL_64-NEXT: vpaddq %zmm4, %zmm2, %zmm2 587; KNL_64-NEXT: vpsrlq $32, %zmm0, %zmm0 588; KNL_64-NEXT: vpmuludq %zmm3, %zmm0, %zmm0 589; KNL_64-NEXT: vpsllq $32, %zmm0, %zmm0 590; KNL_64-NEXT: vpaddq %zmm0, %zmm2, %zmm0 591; KNL_64-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 592; KNL_64-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 593; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm1 594; KNL_64-NEXT: kxnorw %k0, %k0, %k1 595; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 596; KNL_64-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 597; KNL_64-NEXT: retq 598; 599; KNL_32-LABEL: test10: 600; KNL_32: # %bb.0: # %entry 601; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %ymm2 602; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = [80,80,80,80,80,80,80,80] 603; KNL_32-NEXT: vpmulld %ymm3, %ymm1, %ymm1 604; KNL_32-NEXT: vpmovqd %zmm0, %ymm0 605; KNL_32-NEXT: vpbroadcastd {{.*#+}} ymm3 = 
[820,820,820,820,820,820,820,820] 606; KNL_32-NEXT: vpmulld %ymm3, %ymm0, %ymm0 607; KNL_32-NEXT: vpaddd %ymm0, %ymm2, %ymm0 608; KNL_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 609; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 610; KNL_32-NEXT: movw $255, %ax 611; KNL_32-NEXT: kmovw %eax, %k1 612; KNL_32-NEXT: vpgatherdd 68(,%zmm1), %zmm0 {%k1} 613; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 614; KNL_32-NEXT: retl 615; 616; SKX_SMALL-LABEL: test10: 617; SKX_SMALL: # %bb.0: # %entry 618; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2 619; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0 620; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0 621; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 622; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1 623; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1 624; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 625; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0 626; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 627; SKX_SMALL-NEXT: retq 628; 629; SKX_LARGE-LABEL: test10: 630; SKX_LARGE: # %bb.0: # %entry 631; SKX_LARGE-NEXT: vpbroadcastq %rdi, %zmm2 632; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero 633; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax 634; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1 635; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax 636; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0 637; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0 638; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1 639; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 640; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0 641; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1} 642; SKX_LARGE-NEXT: retq 643; 644; SKX_32-LABEL: test10: 645; SKX_32: # %bb.0: # %entry 646; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, 
%ymm1, %ymm1 647; SKX_32-NEXT: vpmovqd %zmm0, %ymm0 648; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0 649; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0 650; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1 651; SKX_32-NEXT: kxnorw %k0, %k0, %k1 652; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 653; SKX_32-NEXT: vpgatherdd 68(,%ymm1), %ymm0 {%k1} 654; SKX_32-NEXT: retl 655entry: 656 %broadcast.splatinsert = insertelement <8 x ptr> undef, ptr %base, i32 0 657 %broadcast.splat = shufflevector <8 x ptr> %broadcast.splatinsert, <8 x ptr> undef, <8 x i32> zeroinitializer 658 659 %arrayidx = getelementptr %struct.ST, <8 x ptr> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13 660 %res = call <8 x i32 > @llvm.masked.gather.v8i32.v8p0(<8 x ptr>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 661 ret <8 x i32> %res 662} 663 664; Splat index in GEP, requires broadcast 665define <16 x float> @test11(ptr %base, i32 %ind) { 666; KNL_64-LABEL: test11: 667; KNL_64: # %bb.0: 668; KNL_64-NEXT: movslq %esi, %rax 669; KNL_64-NEXT: leaq (%rdi,%rax,4), %rax 670; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 671; KNL_64-NEXT: kxnorw %k0, %k0, %k1 672; KNL_64-NEXT: vxorps %xmm0, %xmm0, %xmm0 673; KNL_64-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1} 674; KNL_64-NEXT: retq 675; 676; KNL_32-LABEL: test11: 677; KNL_32: # %bb.0: 678; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 679; KNL_32-NEXT: shll $2, %eax 680; KNL_32-NEXT: addl {{[0-9]+}}(%esp), %eax 681; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 682; KNL_32-NEXT: kxnorw %k0, %k0, %k1 683; KNL_32-NEXT: vxorps %xmm0, %xmm0, %xmm0 684; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 685; KNL_32-NEXT: retl 686; 687; SKX-LABEL: test11: 688; SKX: # %bb.0: 689; SKX-NEXT: movslq %esi, %rax 690; SKX-NEXT: leaq (%rdi,%rax,4), %rax 691; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 692; SKX-NEXT: kxnorw %k0, %k0, %k1 693; SKX-NEXT: vxorps %xmm0, %xmm0, %xmm0 
694; SKX-NEXT: vgatherdps (%rax,%zmm1,4), %zmm0 {%k1} 695; SKX-NEXT: retq 696; 697; SKX_32-LABEL: test11: 698; SKX_32: # %bb.0: 699; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 700; SKX_32-NEXT: shll $2, %eax 701; SKX_32-NEXT: addl {{[0-9]+}}(%esp), %eax 702; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 703; SKX_32-NEXT: kxnorw %k0, %k0, %k1 704; SKX_32-NEXT: vxorps %xmm0, %xmm0, %xmm0 705; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 706; SKX_32-NEXT: retl 707 708 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 709 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 710 711 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, i32 %ind 712 713 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 714 ret <16 x float>%res 715} 716 717; We are checking the uniform base here. 
It is taken directly from input to vgatherdps 718define <16 x float> @test12(ptr %base, <16 x i32> %ind) { 719; KNL_64-LABEL: test12: 720; KNL_64: # %bb.0: 721; KNL_64-NEXT: kxnorw %k0, %k0, %k1 722; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 723; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 724; KNL_64-NEXT: vmovaps %zmm1, %zmm0 725; KNL_64-NEXT: retq 726; 727; KNL_32-LABEL: test12: 728; KNL_32: # %bb.0: 729; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 730; KNL_32-NEXT: kxnorw %k0, %k0, %k1 731; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 732; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 733; KNL_32-NEXT: vmovaps %zmm1, %zmm0 734; KNL_32-NEXT: retl 735; 736; SKX-LABEL: test12: 737; SKX: # %bb.0: 738; SKX-NEXT: kxnorw %k0, %k0, %k1 739; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 740; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 741; SKX-NEXT: vmovaps %zmm1, %zmm0 742; SKX-NEXT: retq 743; 744; SKX_32-LABEL: test12: 745; SKX_32: # %bb.0: 746; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 747; SKX_32-NEXT: kxnorw %k0, %k0, %k1 748; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 749; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 750; SKX_32-NEXT: vmovaps %zmm1, %zmm0 751; SKX_32-NEXT: retl 752 753 %sext_ind = sext <16 x i32> %ind to <16 x i64> 754 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind 755 756 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 757 ret <16 x float>%res 758} 759 760; The same as the previous, but the mask is undefined 761define <16 x float> @test13(ptr %base, <16 x i32> %ind) { 762; KNL_64-LABEL: test13: 763; KNL_64: # %bb.0: 764; KNL_64-NEXT: kxnorw %k0, %k0, %k1 765; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 766; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 767; KNL_64-NEXT: vmovaps %zmm1, %zmm0 768; KNL_64-NEXT: 
retq 769; 770; KNL_32-LABEL: test13: 771; KNL_32: # %bb.0: 772; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 773; KNL_32-NEXT: kxnorw %k0, %k0, %k1 774; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 775; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 776; KNL_32-NEXT: vmovaps %zmm1, %zmm0 777; KNL_32-NEXT: retl 778; 779; SKX-LABEL: test13: 780; SKX: # %bb.0: 781; SKX-NEXT: kxnorw %k0, %k0, %k1 782; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 783; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 784; SKX-NEXT: vmovaps %zmm1, %zmm0 785; SKX-NEXT: retq 786; 787; SKX_32-LABEL: test13: 788; SKX_32: # %bb.0: 789; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 790; SKX_32-NEXT: kxnorw %k0, %k0, %k1 791; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 792; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 793; SKX_32-NEXT: vmovaps %zmm1, %zmm0 794; SKX_32-NEXT: retl 795 796 %sext_ind = sext <16 x i32> %ind to <16 x i64> 797 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind 798 799 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 800 ret <16 x float>%res 801} 802 803; The base pointer is not splat, can't find uniform base 804define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) { 805; KNL_64-LABEL: test14: 806; KNL_64: # %bb.0: 807; KNL_64-NEXT: vmovq %xmm0, %rax 808; KNL_64-NEXT: vmovd %esi, %xmm0 809; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0 810; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 811; KNL_64-NEXT: kxnorw %k0, %k0, %k1 812; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 813; KNL_64-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1} 814; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0 815; KNL_64-NEXT: retq 816; 817; KNL_32-LABEL: test14: 818; KNL_32: # %bb.0: 819; KNL_32-NEXT: vmovd %xmm0, %eax 820; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1 821; KNL_32-NEXT: 
kxnorw %k0, %k0, %k1 822; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 823; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 824; KNL_32-NEXT: retl 825; 826; SKX-LABEL: test14: 827; SKX: # %bb.0: 828; SKX-NEXT: vmovq %xmm0, %rax 829; SKX-NEXT: vpbroadcastd %esi, %ymm0 830; SKX-NEXT: vpmovsxdq %ymm0, %zmm0 831; SKX-NEXT: kxnorw %k0, %k0, %k1 832; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 833; SKX-NEXT: vgatherqps (%rax,%zmm0,4), %ymm1 {%k1} 834; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0 835; SKX-NEXT: retq 836; 837; SKX_32-LABEL: test14: 838; SKX_32: # %bb.0: 839; SKX_32-NEXT: vmovd %xmm0, %eax 840; SKX_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1 841; SKX_32-NEXT: kxnorw %k0, %k0, %k1 842; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 843; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 844; SKX_32-NEXT: retl 845 846 %broadcast.splatinsert = insertelement <16 x ptr> %vec, ptr %base, i32 1 847 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 848 849 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, i32 %ind 850 851 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 852 ret <16 x float>%res 853} 854 855declare <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x float>) 856declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x double>) 857declare <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x double>) 858 859; Gather smaller than existing instruction 860define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) { 861; KNL_64-LABEL: test15: 862; KNL_64: # %bb.0: 863; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 864; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 865; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0 866; 
KNL_64-NEXT: vpsllq $2, %ymm0, %ymm0 867; KNL_64-NEXT: vmovq %rdi, %xmm1 868; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 869; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm1 870; KNL_64-NEXT: kmovw %k0, %eax 871; KNL_64-NEXT: testb $1, %al 872; KNL_64-NEXT: # implicit-def: $xmm0 873; KNL_64-NEXT: je .LBB14_2 874; KNL_64-NEXT: # %bb.1: # %cond.load 875; KNL_64-NEXT: vmovq %xmm1, %rcx 876; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 877; KNL_64-NEXT: .LBB14_2: # %else 878; KNL_64-NEXT: testb $2, %al 879; KNL_64-NEXT: je .LBB14_4 880; KNL_64-NEXT: # %bb.3: # %cond.load1 881; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 882; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 883; KNL_64-NEXT: .LBB14_4: # %else2 884; KNL_64-NEXT: testb $4, %al 885; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 886; KNL_64-NEXT: jne .LBB14_5 887; KNL_64-NEXT: # %bb.6: # %else5 888; KNL_64-NEXT: testb $8, %al 889; KNL_64-NEXT: jne .LBB14_7 890; KNL_64-NEXT: .LBB14_8: # %else8 891; KNL_64-NEXT: vzeroupper 892; KNL_64-NEXT: retq 893; KNL_64-NEXT: .LBB14_5: # %cond.load4 894; KNL_64-NEXT: vmovq %xmm1, %rcx 895; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 896; KNL_64-NEXT: testb $8, %al 897; KNL_64-NEXT: je .LBB14_8 898; KNL_64-NEXT: .LBB14_7: # %cond.load7 899; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 900; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 901; KNL_64-NEXT: vzeroupper 902; KNL_64-NEXT: retq 903; 904; KNL_32-LABEL: test15: 905; KNL_32: # %bb.0: 906; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 907; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 908; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 909; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 910; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm1 911; KNL_32-NEXT: kmovw %k0, %eax 912; KNL_32-NEXT: testb $1, %al 913; KNL_32-NEXT: # implicit-def: $xmm0 914; KNL_32-NEXT: jne .LBB14_1 915; KNL_32-NEXT: # %bb.2: # %else 916; KNL_32-NEXT: testb $2, %al 917; KNL_32-NEXT: jne .LBB14_3 918; KNL_32-NEXT: .LBB14_4: # %else2 919; 
KNL_32-NEXT: testb $4, %al 920; KNL_32-NEXT: jne .LBB14_5 921; KNL_32-NEXT: .LBB14_6: # %else5 922; KNL_32-NEXT: testb $8, %al 923; KNL_32-NEXT: jne .LBB14_7 924; KNL_32-NEXT: .LBB14_8: # %else8 925; KNL_32-NEXT: vzeroupper 926; KNL_32-NEXT: retl 927; KNL_32-NEXT: .LBB14_1: # %cond.load 928; KNL_32-NEXT: vmovd %xmm1, %ecx 929; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 930; KNL_32-NEXT: testb $2, %al 931; KNL_32-NEXT: je .LBB14_4 932; KNL_32-NEXT: .LBB14_3: # %cond.load1 933; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx 934; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 935; KNL_32-NEXT: testb $4, %al 936; KNL_32-NEXT: je .LBB14_6 937; KNL_32-NEXT: .LBB14_5: # %cond.load4 938; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx 939; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3] 940; KNL_32-NEXT: testb $8, %al 941; KNL_32-NEXT: je .LBB14_8 942; KNL_32-NEXT: .LBB14_7: # %cond.load7 943; KNL_32-NEXT: vpextrd $3, %xmm1, %eax 944; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] 945; KNL_32-NEXT: vzeroupper 946; KNL_32-NEXT: retl 947; 948; SKX-LABEL: test15: 949; SKX: # %bb.0: 950; SKX-NEXT: vpslld $31, %xmm1, %xmm1 951; SKX-NEXT: vpmovd2m %xmm1, %k1 952; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 953; SKX-NEXT: vgatherdps (%rdi,%xmm0,4), %xmm1 {%k1} 954; SKX-NEXT: vmovaps %xmm1, %xmm0 955; SKX-NEXT: retq 956; 957; SKX_32-LABEL: test15: 958; SKX_32: # %bb.0: 959; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 960; SKX_32-NEXT: vpmovd2m %xmm1, %k1 961; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 962; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 963; SKX_32-NEXT: vgatherdps (%eax,%xmm0,4), %xmm1 {%k1} 964; SKX_32-NEXT: vmovaps %xmm1, %xmm0 965; SKX_32-NEXT: retl 966 967 %sext_ind = sext <4 x i32> %ind to <4 x i64> 968 %gep.random = getelementptr float, ptr %base, <4 x i64> %sext_ind 969 %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef) 970 ret <4 x float>%res 971} 972 973; Gather smaller 
than existing instruction 974define <4 x double> @test16(ptr %base, <4 x i32> %ind, <4 x i1> %mask, <4 x double> %src0) { 975; KNL_64-LABEL: test16: 976; KNL_64: # %bb.0: 977; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 978; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 979; KNL_64-NEXT: vpmovsxdq %xmm0, %ymm0 980; KNL_64-NEXT: vpsllq $3, %ymm0, %ymm0 981; KNL_64-NEXT: vmovq %rdi, %xmm1 982; KNL_64-NEXT: vpbroadcastq %xmm1, %ymm1 983; KNL_64-NEXT: vpaddq %ymm0, %ymm1, %ymm0 984; KNL_64-NEXT: kmovw %k0, %eax 985; KNL_64-NEXT: testb $1, %al 986; KNL_64-NEXT: je .LBB15_2 987; KNL_64-NEXT: # %bb.1: # %cond.load 988; KNL_64-NEXT: vmovq %xmm0, %rcx 989; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 990; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] 991; KNL_64-NEXT: .LBB15_2: # %else 992; KNL_64-NEXT: testb $2, %al 993; KNL_64-NEXT: je .LBB15_4 994; KNL_64-NEXT: # %bb.3: # %cond.load1 995; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 996; KNL_64-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] 997; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 998; KNL_64-NEXT: .LBB15_4: # %else2 999; KNL_64-NEXT: testb $4, %al 1000; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm0 1001; KNL_64-NEXT: jne .LBB15_5 1002; KNL_64-NEXT: # %bb.6: # %else5 1003; KNL_64-NEXT: testb $8, %al 1004; KNL_64-NEXT: jne .LBB15_7 1005; KNL_64-NEXT: .LBB15_8: # %else8 1006; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 1007; KNL_64-NEXT: retq 1008; KNL_64-NEXT: .LBB15_5: # %cond.load4 1009; KNL_64-NEXT: vmovq %xmm0, %rcx 1010; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm1 1011; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] 1012; KNL_64-NEXT: testb $8, %al 1013; KNL_64-NEXT: je .LBB15_8 1014; KNL_64-NEXT: .LBB15_7: # %cond.load7 1015; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 1016; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 1017; KNL_64-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] 1018; KNL_64-NEXT: vmovdqa %ymm2, %ymm0 1019; KNL_64-NEXT: retq 1020; 1021; KNL_32-LABEL: test16: 
1022; KNL_32: # %bb.0: 1023; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 1024; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 1025; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 1026; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 1027; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1028; KNL_32-NEXT: kmovw %k0, %eax 1029; KNL_32-NEXT: testb $1, %al 1030; KNL_32-NEXT: jne .LBB15_1 1031; KNL_32-NEXT: # %bb.2: # %else 1032; KNL_32-NEXT: testb $2, %al 1033; KNL_32-NEXT: jne .LBB15_3 1034; KNL_32-NEXT: .LBB15_4: # %else2 1035; KNL_32-NEXT: testb $4, %al 1036; KNL_32-NEXT: jne .LBB15_5 1037; KNL_32-NEXT: .LBB15_6: # %else5 1038; KNL_32-NEXT: testb $8, %al 1039; KNL_32-NEXT: jne .LBB15_7 1040; KNL_32-NEXT: .LBB15_8: # %else8 1041; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 1042; KNL_32-NEXT: retl 1043; KNL_32-NEXT: .LBB15_1: # %cond.load 1044; KNL_32-NEXT: vmovd %xmm0, %ecx 1045; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 1046; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1],ymm2[2,3,4,5,6,7] 1047; KNL_32-NEXT: testb $2, %al 1048; KNL_32-NEXT: je .LBB15_4 1049; KNL_32-NEXT: .LBB15_3: # %cond.load1 1050; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx 1051; KNL_32-NEXT: vmovhps {{.*#+}} xmm1 = xmm2[0,1],mem[0,1] 1052; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] 1053; KNL_32-NEXT: testb $4, %al 1054; KNL_32-NEXT: je .LBB15_6 1055; KNL_32-NEXT: .LBB15_5: # %cond.load4 1056; KNL_32-NEXT: vpextrd $2, %xmm0, %ecx 1057; KNL_32-NEXT: vpbroadcastq (%ecx), %ymm1 1058; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] 1059; KNL_32-NEXT: testb $8, %al 1060; KNL_32-NEXT: je .LBB15_8 1061; KNL_32-NEXT: .LBB15_7: # %cond.load7 1062; KNL_32-NEXT: vpextrd $3, %xmm0, %eax 1063; KNL_32-NEXT: vpbroadcastq (%eax), %ymm0 1064; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm0[6,7] 1065; KNL_32-NEXT: vmovdqa %ymm2, %ymm0 1066; KNL_32-NEXT: retl 1067; 1068; SKX-LABEL: test16: 1069; SKX: # %bb.0: 1070; SKX-NEXT: vpslld $31, %xmm1, %xmm1 1071; SKX-NEXT: vpmovd2m %xmm1, %k1 1072; 
SKX-NEXT: vgatherdpd (%rdi,%xmm0,8), %ymm2 {%k1} 1073; SKX-NEXT: vmovapd %ymm2, %ymm0 1074; SKX-NEXT: retq 1075; 1076; SKX_32-LABEL: test16: 1077; SKX_32: # %bb.0: 1078; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 1079; SKX_32-NEXT: vpmovd2m %xmm1, %k1 1080; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1081; SKX_32-NEXT: vgatherdpd (%eax,%xmm0,8), %ymm2 {%k1} 1082; SKX_32-NEXT: vmovapd %ymm2, %ymm0 1083; SKX_32-NEXT: retl 1084 1085 %sext_ind = sext <4 x i32> %ind to <4 x i64> 1086 %gep.random = getelementptr double, ptr %base, <4 x i64> %sext_ind 1087 %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0) 1088 ret <4 x double>%res 1089} 1090 1091define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x double> %src0) { 1092; KNL_64-LABEL: test17: 1093; KNL_64: # %bb.0: 1094; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 1095; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 1096; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 1097; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 1098; KNL_64-NEXT: vmovq %rdi, %xmm1 1099; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 1100; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1101; KNL_64-NEXT: kmovw %k0, %eax 1102; KNL_64-NEXT: testb $1, %al 1103; KNL_64-NEXT: jne .LBB16_1 1104; KNL_64-NEXT: # %bb.2: # %else 1105; KNL_64-NEXT: testb $2, %al 1106; KNL_64-NEXT: jne .LBB16_3 1107; KNL_64-NEXT: .LBB16_4: # %else2 1108; KNL_64-NEXT: vmovaps %xmm2, %xmm0 1109; KNL_64-NEXT: vzeroupper 1110; KNL_64-NEXT: retq 1111; KNL_64-NEXT: .LBB16_1: # %cond.load 1112; KNL_64-NEXT: vmovq %xmm0, %rcx 1113; KNL_64-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1114; KNL_64-NEXT: testb $2, %al 1115; KNL_64-NEXT: je .LBB16_4 1116; KNL_64-NEXT: .LBB16_3: # %cond.load1 1117; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 1118; KNL_64-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1119; KNL_64-NEXT: vmovaps %xmm2, %xmm0 1120; KNL_64-NEXT: vzeroupper 1121; KNL_64-NEXT: retq 1122; 1123; KNL_32-LABEL: test17: 1124; KNL_32: # %bb.0: 
1125; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 1126; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 1127; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 1128; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 1129; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1130; KNL_32-NEXT: kmovw %k0, %eax 1131; KNL_32-NEXT: testb $1, %al 1132; KNL_32-NEXT: jne .LBB16_1 1133; KNL_32-NEXT: # %bb.2: # %else 1134; KNL_32-NEXT: testb $2, %al 1135; KNL_32-NEXT: jne .LBB16_3 1136; KNL_32-NEXT: .LBB16_4: # %else2 1137; KNL_32-NEXT: vmovaps %xmm2, %xmm0 1138; KNL_32-NEXT: vzeroupper 1139; KNL_32-NEXT: retl 1140; KNL_32-NEXT: .LBB16_1: # %cond.load 1141; KNL_32-NEXT: vmovd %xmm0, %ecx 1142; KNL_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1143; KNL_32-NEXT: testb $2, %al 1144; KNL_32-NEXT: je .LBB16_4 1145; KNL_32-NEXT: .LBB16_3: # %cond.load1 1146; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 1147; KNL_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1148; KNL_32-NEXT: vmovaps %xmm2, %xmm0 1149; KNL_32-NEXT: vzeroupper 1150; KNL_32-NEXT: retl 1151; 1152; SKX-LABEL: test17: 1153; SKX: # %bb.0: 1154; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1155; SKX-NEXT: vpmovq2m %xmm1, %k0 1156; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 1157; SKX-NEXT: vpbroadcastq %rdi, %xmm1 1158; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 1159; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1160; SKX-NEXT: kmovw %k0, %eax 1161; SKX-NEXT: testb $1, %al 1162; SKX-NEXT: jne .LBB16_1 1163; SKX-NEXT: # %bb.2: # %else 1164; SKX-NEXT: testb $2, %al 1165; SKX-NEXT: jne .LBB16_3 1166; SKX-NEXT: .LBB16_4: # %else2 1167; SKX-NEXT: vmovaps %xmm2, %xmm0 1168; SKX-NEXT: retq 1169; SKX-NEXT: .LBB16_1: # %cond.load 1170; SKX-NEXT: vmovq %xmm0, %rcx 1171; SKX-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1172; SKX-NEXT: testb $2, %al 1173; SKX-NEXT: je .LBB16_4 1174; SKX-NEXT: .LBB16_3: # %cond.load1 1175; SKX-NEXT: vpextrq $1, %xmm0, %rax 1176; SKX-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1177; SKX-NEXT: vmovaps %xmm2, %xmm0 1178; SKX-NEXT: retq 1179; 1180; SKX_32-LABEL: 
test17: 1181; SKX_32: # %bb.0: 1182; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1183; SKX_32-NEXT: vpmovq2m %xmm1, %k0 1184; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 1185; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 1186; SKX_32-NEXT: kmovw %k0, %eax 1187; SKX_32-NEXT: testb $1, %al 1188; SKX_32-NEXT: jne .LBB16_1 1189; SKX_32-NEXT: # %bb.2: # %else 1190; SKX_32-NEXT: testb $2, %al 1191; SKX_32-NEXT: jne .LBB16_3 1192; SKX_32-NEXT: .LBB16_4: # %else2 1193; SKX_32-NEXT: vmovaps %xmm2, %xmm0 1194; SKX_32-NEXT: retl 1195; SKX_32-NEXT: .LBB16_1: # %cond.load 1196; SKX_32-NEXT: vmovd %xmm0, %ecx 1197; SKX_32-NEXT: vmovlps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3] 1198; SKX_32-NEXT: testb $2, %al 1199; SKX_32-NEXT: je .LBB16_4 1200; SKX_32-NEXT: .LBB16_3: # %cond.load1 1201; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 1202; SKX_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1203; SKX_32-NEXT: vmovaps %xmm2, %xmm0 1204; SKX_32-NEXT: retl 1205 1206 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1207 %gep.random = getelementptr double, ptr %base, <2 x i64> %sext_ind 1208 %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0) 1209 ret <2 x double>%res 1210} 1211 1212declare void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> , <4 x ptr> , i32 , <4 x i1> ) 1213declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double> , <4 x ptr> , i32 , <4 x i1> ) 1214declare void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> , <2 x ptr> , i32 , <2 x i1> ) 1215declare void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> , <2 x ptr> , i32 , <2 x i1> ) 1216declare void @llvm.masked.scatter.v2f32.v2p0(<2 x float> , <2 x ptr> , i32 , <2 x i1> ) 1217 1218define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) { 1219; KNL_64-LABEL: test18: 1220; KNL_64: # %bb.0: 1221; KNL_64-NEXT: vpslld $31, %xmm2, %xmm2 1222; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k0 1223; KNL_64-NEXT: kmovw %k0, %eax 1224; KNL_64-NEXT: testb $1, %al 1225; KNL_64-NEXT: je 
.LBB17_2 1226; KNL_64-NEXT: # %bb.1: # %cond.store 1227; KNL_64-NEXT: vmovq %xmm1, %rcx 1228; KNL_64-NEXT: vmovss %xmm0, (%rcx) 1229; KNL_64-NEXT: .LBB17_2: # %else 1230; KNL_64-NEXT: testb $2, %al 1231; KNL_64-NEXT: je .LBB17_4 1232; KNL_64-NEXT: # %bb.3: # %cond.store1 1233; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 1234; KNL_64-NEXT: vextractps $1, %xmm0, (%rcx) 1235; KNL_64-NEXT: .LBB17_4: # %else2 1236; KNL_64-NEXT: testb $4, %al 1237; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 1238; KNL_64-NEXT: jne .LBB17_5 1239; KNL_64-NEXT: # %bb.6: # %else4 1240; KNL_64-NEXT: testb $8, %al 1241; KNL_64-NEXT: jne .LBB17_7 1242; KNL_64-NEXT: .LBB17_8: # %else6 1243; KNL_64-NEXT: vzeroupper 1244; KNL_64-NEXT: retq 1245; KNL_64-NEXT: .LBB17_5: # %cond.store3 1246; KNL_64-NEXT: vmovq %xmm1, %rcx 1247; KNL_64-NEXT: vextractps $2, %xmm0, (%rcx) 1248; KNL_64-NEXT: testb $8, %al 1249; KNL_64-NEXT: je .LBB17_8 1250; KNL_64-NEXT: .LBB17_7: # %cond.store5 1251; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 1252; KNL_64-NEXT: vextractps $3, %xmm0, (%rax) 1253; KNL_64-NEXT: vzeroupper 1254; KNL_64-NEXT: retq 1255; 1256; KNL_32-LABEL: test18: 1257; KNL_32: # %bb.0: 1258; KNL_32-NEXT: vpslld $31, %xmm2, %xmm2 1259; KNL_32-NEXT: vptestmd %zmm2, %zmm2, %k0 1260; KNL_32-NEXT: kmovw %k0, %eax 1261; KNL_32-NEXT: testb $1, %al 1262; KNL_32-NEXT: jne .LBB17_1 1263; KNL_32-NEXT: # %bb.2: # %else 1264; KNL_32-NEXT: testb $2, %al 1265; KNL_32-NEXT: jne .LBB17_3 1266; KNL_32-NEXT: .LBB17_4: # %else2 1267; KNL_32-NEXT: testb $4, %al 1268; KNL_32-NEXT: jne .LBB17_5 1269; KNL_32-NEXT: .LBB17_6: # %else4 1270; KNL_32-NEXT: testb $8, %al 1271; KNL_32-NEXT: jne .LBB17_7 1272; KNL_32-NEXT: .LBB17_8: # %else6 1273; KNL_32-NEXT: vzeroupper 1274; KNL_32-NEXT: retl 1275; KNL_32-NEXT: .LBB17_1: # %cond.store 1276; KNL_32-NEXT: vmovd %xmm1, %ecx 1277; KNL_32-NEXT: vmovss %xmm0, (%ecx) 1278; KNL_32-NEXT: testb $2, %al 1279; KNL_32-NEXT: je .LBB17_4 1280; KNL_32-NEXT: .LBB17_3: # %cond.store1 1281; KNL_32-NEXT: vpextrd $1, 
%xmm1, %ecx 1282; KNL_32-NEXT: vextractps $1, %xmm0, (%ecx) 1283; KNL_32-NEXT: testb $4, %al 1284; KNL_32-NEXT: je .LBB17_6 1285; KNL_32-NEXT: .LBB17_5: # %cond.store3 1286; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx 1287; KNL_32-NEXT: vextractps $2, %xmm0, (%ecx) 1288; KNL_32-NEXT: testb $8, %al 1289; KNL_32-NEXT: je .LBB17_8 1290; KNL_32-NEXT: .LBB17_7: # %cond.store5 1291; KNL_32-NEXT: vpextrd $3, %xmm1, %eax 1292; KNL_32-NEXT: vextractps $3, %xmm0, (%eax) 1293; KNL_32-NEXT: vzeroupper 1294; KNL_32-NEXT: retl 1295; 1296; SKX-LABEL: test18: 1297; SKX: # %bb.0: 1298; SKX-NEXT: vpslld $31, %xmm2, %xmm2 1299; SKX-NEXT: vpmovd2m %xmm2, %k1 1300; SKX-NEXT: vpscatterqd %xmm0, (,%ymm1) {%k1} 1301; SKX-NEXT: vzeroupper 1302; SKX-NEXT: retq 1303; 1304; SKX_32-LABEL: test18: 1305; SKX_32: # %bb.0: 1306; SKX_32-NEXT: vpslld $31, %xmm2, %xmm2 1307; SKX_32-NEXT: vpmovd2m %xmm2, %k1 1308; SKX_32-NEXT: vpscatterdd %xmm0, (,%xmm1) {%k1} 1309; SKX_32-NEXT: retl 1310 call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %a1, <4 x ptr> %ptr, i32 4, <4 x i1> %mask) 1311 ret void 1312} 1313 1314define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) { 1315; KNL_64-LABEL: test19: 1316; KNL_64: # %bb.0: 1317; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 1318; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 1319; KNL_64-NEXT: vpsllq $3, %ymm2, %ymm1 1320; KNL_64-NEXT: vmovq %rdi, %xmm2 1321; KNL_64-NEXT: vpbroadcastq %xmm2, %ymm2 1322; KNL_64-NEXT: vpaddq %ymm1, %ymm2, %ymm1 1323; KNL_64-NEXT: kmovw %k0, %eax 1324; KNL_64-NEXT: testb $1, %al 1325; KNL_64-NEXT: je .LBB18_2 1326; KNL_64-NEXT: # %bb.1: # %cond.store 1327; KNL_64-NEXT: vmovq %xmm1, %rcx 1328; KNL_64-NEXT: vmovlps %xmm0, (%rcx) 1329; KNL_64-NEXT: .LBB18_2: # %else 1330; KNL_64-NEXT: testb $2, %al 1331; KNL_64-NEXT: je .LBB18_4 1332; KNL_64-NEXT: # %bb.3: # %cond.store1 1333; KNL_64-NEXT: vpextrq $1, %xmm1, %rcx 1334; KNL_64-NEXT: vmovhps %xmm0, (%rcx) 1335; KNL_64-NEXT: .LBB18_4: # %else2 1336; KNL_64-NEXT: testb $4, %al 
1337; KNL_64-NEXT: vextractf128 $1, %ymm0, %xmm0 1338; KNL_64-NEXT: vextracti128 $1, %ymm1, %xmm1 1339; KNL_64-NEXT: jne .LBB18_5 1340; KNL_64-NEXT: # %bb.6: # %else4 1341; KNL_64-NEXT: testb $8, %al 1342; KNL_64-NEXT: jne .LBB18_7 1343; KNL_64-NEXT: .LBB18_8: # %else6 1344; KNL_64-NEXT: vzeroupper 1345; KNL_64-NEXT: retq 1346; KNL_64-NEXT: .LBB18_5: # %cond.store3 1347; KNL_64-NEXT: vmovq %xmm1, %rcx 1348; KNL_64-NEXT: vmovlps %xmm0, (%rcx) 1349; KNL_64-NEXT: testb $8, %al 1350; KNL_64-NEXT: je .LBB18_8 1351; KNL_64-NEXT: .LBB18_7: # %cond.store5 1352; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 1353; KNL_64-NEXT: vmovhps %xmm0, (%rax) 1354; KNL_64-NEXT: vzeroupper 1355; KNL_64-NEXT: retq 1356; 1357; KNL_32-LABEL: test19: 1358; KNL_32: # %bb.0: 1359; KNL_32-NEXT: # kill: def $ymm2 killed $ymm2 def $zmm2 1360; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 1361; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 1362; KNL_32-NEXT: vpmovqd %zmm2, %ymm1 1363; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 1364; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 1365; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 1366; KNL_32-NEXT: kmovw %k0, %eax 1367; KNL_32-NEXT: testb $1, %al 1368; KNL_32-NEXT: je .LBB18_2 1369; KNL_32-NEXT: # %bb.1: # %cond.store 1370; KNL_32-NEXT: vmovd %xmm1, %ecx 1371; KNL_32-NEXT: vmovlps %xmm0, (%ecx) 1372; KNL_32-NEXT: .LBB18_2: # %else 1373; KNL_32-NEXT: testb $2, %al 1374; KNL_32-NEXT: je .LBB18_4 1375; KNL_32-NEXT: # %bb.3: # %cond.store1 1376; KNL_32-NEXT: vpextrd $1, %xmm1, %ecx 1377; KNL_32-NEXT: vmovhps %xmm0, (%ecx) 1378; KNL_32-NEXT: .LBB18_4: # %else2 1379; KNL_32-NEXT: testb $4, %al 1380; KNL_32-NEXT: vextractf128 $1, %ymm0, %xmm0 1381; KNL_32-NEXT: jne .LBB18_5 1382; KNL_32-NEXT: # %bb.6: # %else4 1383; KNL_32-NEXT: testb $8, %al 1384; KNL_32-NEXT: jne .LBB18_7 1385; KNL_32-NEXT: .LBB18_8: # %else6 1386; KNL_32-NEXT: vzeroupper 1387; KNL_32-NEXT: retl 1388; KNL_32-NEXT: .LBB18_5: # %cond.store3 1389; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx 1390; KNL_32-NEXT: vmovlps %xmm0, 
(%ecx) 1391; KNL_32-NEXT: testb $8, %al 1392; KNL_32-NEXT: je .LBB18_8 1393; KNL_32-NEXT: .LBB18_7: # %cond.store5 1394; KNL_32-NEXT: vpextrd $3, %xmm1, %eax 1395; KNL_32-NEXT: vmovhps %xmm0, (%eax) 1396; KNL_32-NEXT: vzeroupper 1397; KNL_32-NEXT: retl 1398; 1399; SKX-LABEL: test19: 1400; SKX: # %bb.0: 1401; SKX-NEXT: vpslld $31, %xmm1, %xmm1 1402; SKX-NEXT: vpmovd2m %xmm1, %k1 1403; SKX-NEXT: vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1} 1404; SKX-NEXT: vzeroupper 1405; SKX-NEXT: retq 1406; 1407; SKX_32-LABEL: test19: 1408; SKX_32: # %bb.0: 1409; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 1410; SKX_32-NEXT: vpmovd2m %xmm1, %k1 1411; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 1412; SKX_32-NEXT: vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1} 1413; SKX_32-NEXT: vzeroupper 1414; SKX_32-NEXT: retl 1415 %gep = getelementptr double, ptr %ptr, <4 x i64> %ind 1416 call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %a1, <4 x ptr> %gep, i32 8, <4 x i1> %mask) 1417 ret void 1418} 1419 1420; Data type requires widening 1421define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) { 1422; KNL_64-LABEL: test20: 1423; KNL_64: # %bb.0: 1424; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 1425; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 1426; KNL_64-NEXT: kmovw %k0, %eax 1427; KNL_64-NEXT: testb $1, %al 1428; KNL_64-NEXT: jne .LBB19_1 1429; KNL_64-NEXT: # %bb.2: # %else 1430; KNL_64-NEXT: testb $2, %al 1431; KNL_64-NEXT: jne .LBB19_3 1432; KNL_64-NEXT: .LBB19_4: # %else2 1433; KNL_64-NEXT: vzeroupper 1434; KNL_64-NEXT: retq 1435; KNL_64-NEXT: .LBB19_1: # %cond.store 1436; KNL_64-NEXT: vmovq %xmm1, %rcx 1437; KNL_64-NEXT: vmovd %xmm0, (%rcx) 1438; KNL_64-NEXT: testb $2, %al 1439; KNL_64-NEXT: je .LBB19_4 1440; KNL_64-NEXT: .LBB19_3: # %cond.store1 1441; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 1442; KNL_64-NEXT: vextractps $1, %xmm0, (%rax) 1443; KNL_64-NEXT: vzeroupper 1444; KNL_64-NEXT: retq 1445; 1446; KNL_32-LABEL: test20: 1447; KNL_32: # %bb.0: 1448; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 1449; 
KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 1450; KNL_32-NEXT: kmovw %k0, %eax 1451; KNL_32-NEXT: testb $1, %al 1452; KNL_32-NEXT: jne .LBB19_1 1453; KNL_32-NEXT: # %bb.2: # %else 1454; KNL_32-NEXT: testb $2, %al 1455; KNL_32-NEXT: jne .LBB19_3 1456; KNL_32-NEXT: .LBB19_4: # %else2 1457; KNL_32-NEXT: vzeroupper 1458; KNL_32-NEXT: retl 1459; KNL_32-NEXT: .LBB19_1: # %cond.store 1460; KNL_32-NEXT: vmovd %xmm1, %ecx 1461; KNL_32-NEXT: vmovd %xmm0, (%ecx) 1462; KNL_32-NEXT: testb $2, %al 1463; KNL_32-NEXT: je .LBB19_4 1464; KNL_32-NEXT: .LBB19_3: # %cond.store1 1465; KNL_32-NEXT: vpextrd $1, %xmm1, %eax 1466; KNL_32-NEXT: vextractps $1, %xmm0, (%eax) 1467; KNL_32-NEXT: vzeroupper 1468; KNL_32-NEXT: retl 1469; 1470; SKX-LABEL: test20: 1471; SKX: # %bb.0: 1472; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 1473; SKX-NEXT: vpmovq2m %xmm2, %k0 1474; SKX-NEXT: kmovw %k0, %eax 1475; SKX-NEXT: testb $1, %al 1476; SKX-NEXT: jne .LBB19_1 1477; SKX-NEXT: # %bb.2: # %else 1478; SKX-NEXT: testb $2, %al 1479; SKX-NEXT: jne .LBB19_3 1480; SKX-NEXT: .LBB19_4: # %else2 1481; SKX-NEXT: retq 1482; SKX-NEXT: .LBB19_1: # %cond.store 1483; SKX-NEXT: vmovq %xmm1, %rcx 1484; SKX-NEXT: vmovd %xmm0, (%rcx) 1485; SKX-NEXT: testb $2, %al 1486; SKX-NEXT: je .LBB19_4 1487; SKX-NEXT: .LBB19_3: # %cond.store1 1488; SKX-NEXT: vpextrq $1, %xmm1, %rax 1489; SKX-NEXT: vextractps $1, %xmm0, (%rax) 1490; SKX-NEXT: retq 1491; 1492; SKX_32-LABEL: test20: 1493; SKX_32: # %bb.0: 1494; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 1495; SKX_32-NEXT: vpmovq2m %xmm2, %k0 1496; SKX_32-NEXT: kmovw %k0, %eax 1497; SKX_32-NEXT: testb $1, %al 1498; SKX_32-NEXT: jne .LBB19_1 1499; SKX_32-NEXT: # %bb.2: # %else 1500; SKX_32-NEXT: testb $2, %al 1501; SKX_32-NEXT: jne .LBB19_3 1502; SKX_32-NEXT: .LBB19_4: # %else2 1503; SKX_32-NEXT: retl 1504; SKX_32-NEXT: .LBB19_1: # %cond.store 1505; SKX_32-NEXT: vmovd %xmm1, %ecx 1506; SKX_32-NEXT: vmovd %xmm0, (%ecx) 1507; SKX_32-NEXT: testb $2, %al 1508; SKX_32-NEXT: je .LBB19_4 1509; SKX_32-NEXT: 
.LBB19_3: # %cond.store1 1510; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 1511; SKX_32-NEXT: vextractps $1, %xmm0, (%eax) 1512; SKX_32-NEXT: retl 1513 call void @llvm.masked.scatter.v2f32.v2p0(<2 x float> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask) 1514 ret void 1515} 1516 1517; Data type requires promotion 1518define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) { 1519; KNL_64-LABEL: test21: 1520; KNL_64: # %bb.0: 1521; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 1522; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 1523; KNL_64-NEXT: kmovw %k0, %eax 1524; KNL_64-NEXT: testb $1, %al 1525; KNL_64-NEXT: jne .LBB20_1 1526; KNL_64-NEXT: # %bb.2: # %else 1527; KNL_64-NEXT: testb $2, %al 1528; KNL_64-NEXT: jne .LBB20_3 1529; KNL_64-NEXT: .LBB20_4: # %else2 1530; KNL_64-NEXT: vzeroupper 1531; KNL_64-NEXT: retq 1532; KNL_64-NEXT: .LBB20_1: # %cond.store 1533; KNL_64-NEXT: vmovq %xmm1, %rcx 1534; KNL_64-NEXT: vmovss %xmm0, (%rcx) 1535; KNL_64-NEXT: testb $2, %al 1536; KNL_64-NEXT: je .LBB20_4 1537; KNL_64-NEXT: .LBB20_3: # %cond.store1 1538; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 1539; KNL_64-NEXT: vextractps $1, %xmm0, (%rax) 1540; KNL_64-NEXT: vzeroupper 1541; KNL_64-NEXT: retq 1542; 1543; KNL_32-LABEL: test21: 1544; KNL_32: # %bb.0: 1545; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 1546; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 1547; KNL_32-NEXT: kmovw %k0, %eax 1548; KNL_32-NEXT: testb $1, %al 1549; KNL_32-NEXT: jne .LBB20_1 1550; KNL_32-NEXT: # %bb.2: # %else 1551; KNL_32-NEXT: testb $2, %al 1552; KNL_32-NEXT: jne .LBB20_3 1553; KNL_32-NEXT: .LBB20_4: # %else2 1554; KNL_32-NEXT: vzeroupper 1555; KNL_32-NEXT: retl 1556; KNL_32-NEXT: .LBB20_1: # %cond.store 1557; KNL_32-NEXT: vmovd %xmm1, %ecx 1558; KNL_32-NEXT: vmovss %xmm0, (%ecx) 1559; KNL_32-NEXT: testb $2, %al 1560; KNL_32-NEXT: je .LBB20_4 1561; KNL_32-NEXT: .LBB20_3: # %cond.store1 1562; KNL_32-NEXT: vpextrd $1, %xmm1, %eax 1563; KNL_32-NEXT: vextractps $1, %xmm0, (%eax) 1564; KNL_32-NEXT: vzeroupper 1565; KNL_32-NEXT: retl 1566; 
1567; SKX-LABEL: test21: 1568; SKX: # %bb.0: 1569; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 1570; SKX-NEXT: vpmovq2m %xmm2, %k0 1571; SKX-NEXT: kmovw %k0, %eax 1572; SKX-NEXT: testb $1, %al 1573; SKX-NEXT: jne .LBB20_1 1574; SKX-NEXT: # %bb.2: # %else 1575; SKX-NEXT: testb $2, %al 1576; SKX-NEXT: jne .LBB20_3 1577; SKX-NEXT: .LBB20_4: # %else2 1578; SKX-NEXT: retq 1579; SKX-NEXT: .LBB20_1: # %cond.store 1580; SKX-NEXT: vmovq %xmm1, %rcx 1581; SKX-NEXT: vmovss %xmm0, (%rcx) 1582; SKX-NEXT: testb $2, %al 1583; SKX-NEXT: je .LBB20_4 1584; SKX-NEXT: .LBB20_3: # %cond.store1 1585; SKX-NEXT: vpextrq $1, %xmm1, %rax 1586; SKX-NEXT: vextractps $1, %xmm0, (%rax) 1587; SKX-NEXT: retq 1588; 1589; SKX_32-LABEL: test21: 1590; SKX_32: # %bb.0: 1591; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 1592; SKX_32-NEXT: vpmovq2m %xmm2, %k0 1593; SKX_32-NEXT: kmovw %k0, %eax 1594; SKX_32-NEXT: testb $1, %al 1595; SKX_32-NEXT: jne .LBB20_1 1596; SKX_32-NEXT: # %bb.2: # %else 1597; SKX_32-NEXT: testb $2, %al 1598; SKX_32-NEXT: jne .LBB20_3 1599; SKX_32-NEXT: .LBB20_4: # %else2 1600; SKX_32-NEXT: retl 1601; SKX_32-NEXT: .LBB20_1: # %cond.store 1602; SKX_32-NEXT: vmovd %xmm1, %ecx 1603; SKX_32-NEXT: vmovss %xmm0, (%ecx) 1604; SKX_32-NEXT: testb $2, %al 1605; SKX_32-NEXT: je .LBB20_4 1606; SKX_32-NEXT: .LBB20_3: # %cond.store1 1607; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 1608; SKX_32-NEXT: vextractps $1, %xmm0, (%eax) 1609; SKX_32-NEXT: retl 1610 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> %mask) 1611 ret void 1612} 1613 1614; The result type requires widening 1615declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x float>) 1616 1617define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) { 1618; KNL_64-LABEL: test22: 1619; KNL_64: # %bb.0: 1620; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 1621; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 1622; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 1623; KNL_64-NEXT: vpsllq $2, 
%xmm0, %xmm0 1624; KNL_64-NEXT: vmovq %rdi, %xmm1 1625; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 1626; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1627; KNL_64-NEXT: kmovw %k0, %eax 1628; KNL_64-NEXT: testb $1, %al 1629; KNL_64-NEXT: jne .LBB21_1 1630; KNL_64-NEXT: # %bb.2: # %else 1631; KNL_64-NEXT: testb $2, %al 1632; KNL_64-NEXT: jne .LBB21_3 1633; KNL_64-NEXT: .LBB21_4: # %else2 1634; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 1635; KNL_64-NEXT: vzeroupper 1636; KNL_64-NEXT: retq 1637; KNL_64-NEXT: .LBB21_1: # %cond.load 1638; KNL_64-NEXT: vmovq %xmm0, %rcx 1639; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1640; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1641; KNL_64-NEXT: testb $2, %al 1642; KNL_64-NEXT: je .LBB21_4 1643; KNL_64-NEXT: .LBB21_3: # %cond.load1 1644; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 1645; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1646; KNL_64-NEXT: vmovaps %xmm2, %xmm0 1647; KNL_64-NEXT: vzeroupper 1648; KNL_64-NEXT: retq 1649; 1650; KNL_32-LABEL: test22: 1651; KNL_32: # %bb.0: 1652; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 1653; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 1654; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 1655; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 1656; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1657; KNL_32-NEXT: kmovw %k0, %eax 1658; KNL_32-NEXT: testb $1, %al 1659; KNL_32-NEXT: jne .LBB21_1 1660; KNL_32-NEXT: # %bb.2: # %else 1661; KNL_32-NEXT: testb $2, %al 1662; KNL_32-NEXT: jne .LBB21_3 1663; KNL_32-NEXT: .LBB21_4: # %else2 1664; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 1665; KNL_32-NEXT: vzeroupper 1666; KNL_32-NEXT: retl 1667; KNL_32-NEXT: .LBB21_1: # %cond.load 1668; KNL_32-NEXT: vmovd %xmm0, %ecx 1669; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1670; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1671; KNL_32-NEXT: testb $2, %al 1672; KNL_32-NEXT: je .LBB21_4 1673; KNL_32-NEXT: .LBB21_3: # %cond.load1 1674; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 1675; KNL_32-NEXT: 
vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1676; KNL_32-NEXT: vmovaps %xmm2, %xmm0 1677; KNL_32-NEXT: vzeroupper 1678; KNL_32-NEXT: retl 1679; 1680; SKX-LABEL: test22: 1681; SKX: # %bb.0: 1682; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1683; SKX-NEXT: vpmovq2m %xmm1, %k0 1684; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 1685; SKX-NEXT: vpbroadcastq %rdi, %xmm1 1686; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 1687; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1688; SKX-NEXT: kmovw %k0, %eax 1689; SKX-NEXT: testb $1, %al 1690; SKX-NEXT: jne .LBB21_1 1691; SKX-NEXT: # %bb.2: # %else 1692; SKX-NEXT: testb $2, %al 1693; SKX-NEXT: jne .LBB21_3 1694; SKX-NEXT: .LBB21_4: # %else2 1695; SKX-NEXT: vmovdqa %xmm2, %xmm0 1696; SKX-NEXT: retq 1697; SKX-NEXT: .LBB21_1: # %cond.load 1698; SKX-NEXT: vmovq %xmm0, %rcx 1699; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1700; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1701; SKX-NEXT: testb $2, %al 1702; SKX-NEXT: je .LBB21_4 1703; SKX-NEXT: .LBB21_3: # %cond.load1 1704; SKX-NEXT: vpextrq $1, %xmm0, %rax 1705; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1706; SKX-NEXT: vmovaps %xmm2, %xmm0 1707; SKX-NEXT: retq 1708; 1709; SKX_32-LABEL: test22: 1710; SKX_32: # %bb.0: 1711; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1712; SKX_32-NEXT: vpmovq2m %xmm1, %k0 1713; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 1714; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 1715; SKX_32-NEXT: kmovw %k0, %eax 1716; SKX_32-NEXT: testb $1, %al 1717; SKX_32-NEXT: jne .LBB21_1 1718; SKX_32-NEXT: # %bb.2: # %else 1719; SKX_32-NEXT: testb $2, %al 1720; SKX_32-NEXT: jne .LBB21_3 1721; SKX_32-NEXT: .LBB21_4: # %else2 1722; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 1723; SKX_32-NEXT: retl 1724; SKX_32-NEXT: .LBB21_1: # %cond.load 1725; SKX_32-NEXT: vmovd %xmm0, %ecx 1726; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1727; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1728; SKX_32-NEXT: testb $2, %al 1729; SKX_32-NEXT: je .LBB21_4 1730; 
SKX_32-NEXT: .LBB21_3: # %cond.load1 1731; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 1732; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1733; SKX_32-NEXT: vmovaps %xmm2, %xmm0 1734; SKX_32-NEXT: retl 1735 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1736 %gep.random = getelementptr float, ptr %base, <2 x i64> %sext_ind 1737 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) 1738 ret <2 x float>%res 1739} 1740 1741define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x float> %src0) { 1742; KNL_64-LABEL: test22a: 1743; KNL_64: # %bb.0: 1744; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 1745; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 1746; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 1747; KNL_64-NEXT: vmovq %rdi, %xmm1 1748; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 1749; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1750; KNL_64-NEXT: kmovw %k0, %eax 1751; KNL_64-NEXT: testb $1, %al 1752; KNL_64-NEXT: jne .LBB22_1 1753; KNL_64-NEXT: # %bb.2: # %else 1754; KNL_64-NEXT: testb $2, %al 1755; KNL_64-NEXT: jne .LBB22_3 1756; KNL_64-NEXT: .LBB22_4: # %else2 1757; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 1758; KNL_64-NEXT: vzeroupper 1759; KNL_64-NEXT: retq 1760; KNL_64-NEXT: .LBB22_1: # %cond.load 1761; KNL_64-NEXT: vmovq %xmm0, %rcx 1762; KNL_64-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1763; KNL_64-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1764; KNL_64-NEXT: testb $2, %al 1765; KNL_64-NEXT: je .LBB22_4 1766; KNL_64-NEXT: .LBB22_3: # %cond.load1 1767; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 1768; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1769; KNL_64-NEXT: vmovaps %xmm2, %xmm0 1770; KNL_64-NEXT: vzeroupper 1771; KNL_64-NEXT: retq 1772; 1773; KNL_32-LABEL: test22a: 1774; KNL_32: # %bb.0: 1775; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 1776; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 1777; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1778; KNL_32-NEXT: vpslld $2, %xmm0, 
%xmm0 1779; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 1780; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1781; KNL_32-NEXT: kmovw %k0, %eax 1782; KNL_32-NEXT: testb $1, %al 1783; KNL_32-NEXT: jne .LBB22_1 1784; KNL_32-NEXT: # %bb.2: # %else 1785; KNL_32-NEXT: testb $2, %al 1786; KNL_32-NEXT: jne .LBB22_3 1787; KNL_32-NEXT: .LBB22_4: # %else2 1788; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 1789; KNL_32-NEXT: vzeroupper 1790; KNL_32-NEXT: retl 1791; KNL_32-NEXT: .LBB22_1: # %cond.load 1792; KNL_32-NEXT: vmovd %xmm0, %ecx 1793; KNL_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1794; KNL_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1795; KNL_32-NEXT: testb $2, %al 1796; KNL_32-NEXT: je .LBB22_4 1797; KNL_32-NEXT: .LBB22_3: # %cond.load1 1798; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 1799; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1800; KNL_32-NEXT: vmovaps %xmm2, %xmm0 1801; KNL_32-NEXT: vzeroupper 1802; KNL_32-NEXT: retl 1803; 1804; SKX-LABEL: test22a: 1805; SKX: # %bb.0: 1806; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1807; SKX-NEXT: vpmovq2m %xmm1, %k0 1808; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 1809; SKX-NEXT: vpbroadcastq %rdi, %xmm1 1810; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1811; SKX-NEXT: kmovw %k0, %eax 1812; SKX-NEXT: testb $1, %al 1813; SKX-NEXT: jne .LBB22_1 1814; SKX-NEXT: # %bb.2: # %else 1815; SKX-NEXT: testb $2, %al 1816; SKX-NEXT: jne .LBB22_3 1817; SKX-NEXT: .LBB22_4: # %else2 1818; SKX-NEXT: vmovdqa %xmm2, %xmm0 1819; SKX-NEXT: retq 1820; SKX-NEXT: .LBB22_1: # %cond.load 1821; SKX-NEXT: vmovq %xmm0, %rcx 1822; SKX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1823; SKX-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1824; SKX-NEXT: testb $2, %al 1825; SKX-NEXT: je .LBB22_4 1826; SKX-NEXT: .LBB22_3: # %cond.load1 1827; SKX-NEXT: vpextrq $1, %xmm0, %rax 1828; SKX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1829; SKX-NEXT: vmovaps %xmm2, %xmm0 1830; SKX-NEXT: retq 1831; 1832; SKX_32-LABEL: test22a: 1833; SKX_32: # 
%bb.0: 1834; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1835; SKX_32-NEXT: vpmovq2m %xmm1, %k0 1836; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 1837; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 1838; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 1839; SKX_32-NEXT: kmovw %k0, %eax 1840; SKX_32-NEXT: testb $1, %al 1841; SKX_32-NEXT: jne .LBB22_1 1842; SKX_32-NEXT: # %bb.2: # %else 1843; SKX_32-NEXT: testb $2, %al 1844; SKX_32-NEXT: jne .LBB22_3 1845; SKX_32-NEXT: .LBB22_4: # %else2 1846; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 1847; SKX_32-NEXT: retl 1848; SKX_32-NEXT: .LBB22_1: # %cond.load 1849; SKX_32-NEXT: vmovd %xmm0, %ecx 1850; SKX_32-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 1851; SKX_32-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] 1852; SKX_32-NEXT: testb $2, %al 1853; SKX_32-NEXT: je .LBB22_4 1854; SKX_32-NEXT: .LBB22_3: # %cond.load1 1855; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 1856; SKX_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3] 1857; SKX_32-NEXT: vmovaps %xmm2, %xmm0 1858; SKX_32-NEXT: retl 1859 %gep.random = getelementptr float, ptr %base, <2 x i64> %ind 1860 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) 1861 ret <2 x float>%res 1862} 1863 1864declare <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i32>) 1865declare <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr>, i32, <2 x i1>, <2 x i64>) 1866 1867define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) { 1868; KNL_64-LABEL: test23: 1869; KNL_64: # %bb.0: 1870; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 1871; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 1872; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 1873; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 1874; KNL_64-NEXT: vmovq %rdi, %xmm1 1875; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 1876; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1877; KNL_64-NEXT: kmovw %k0, %eax 1878; KNL_64-NEXT: testb $1, %al 1879; KNL_64-NEXT: jne 
.LBB23_1 1880; KNL_64-NEXT: # %bb.2: # %else 1881; KNL_64-NEXT: testb $2, %al 1882; KNL_64-NEXT: jne .LBB23_3 1883; KNL_64-NEXT: .LBB23_4: # %else2 1884; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 1885; KNL_64-NEXT: vzeroupper 1886; KNL_64-NEXT: retq 1887; KNL_64-NEXT: .LBB23_1: # %cond.load 1888; KNL_64-NEXT: vmovq %xmm0, %rcx 1889; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 1890; KNL_64-NEXT: testb $2, %al 1891; KNL_64-NEXT: je .LBB23_4 1892; KNL_64-NEXT: .LBB23_3: # %cond.load1 1893; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 1894; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 1895; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 1896; KNL_64-NEXT: vzeroupper 1897; KNL_64-NEXT: retq 1898; 1899; KNL_32-LABEL: test23: 1900; KNL_32: # %bb.0: 1901; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 1902; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 1903; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 1904; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 1905; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 1906; KNL_32-NEXT: kmovw %k0, %eax 1907; KNL_32-NEXT: testb $1, %al 1908; KNL_32-NEXT: jne .LBB23_1 1909; KNL_32-NEXT: # %bb.2: # %else 1910; KNL_32-NEXT: testb $2, %al 1911; KNL_32-NEXT: jne .LBB23_3 1912; KNL_32-NEXT: .LBB23_4: # %else2 1913; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 1914; KNL_32-NEXT: vzeroupper 1915; KNL_32-NEXT: retl 1916; KNL_32-NEXT: .LBB23_1: # %cond.load 1917; KNL_32-NEXT: vmovd %xmm0, %ecx 1918; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 1919; KNL_32-NEXT: testb $2, %al 1920; KNL_32-NEXT: je .LBB23_4 1921; KNL_32-NEXT: .LBB23_3: # %cond.load1 1922; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 1923; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 1924; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 1925; KNL_32-NEXT: vzeroupper 1926; KNL_32-NEXT: retl 1927; 1928; SKX-LABEL: test23: 1929; SKX: # %bb.0: 1930; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 1931; SKX-NEXT: vpmovq2m %xmm1, %k0 1932; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 1933; SKX-NEXT: vpbroadcastq %rdi, %xmm1 1934; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 1935; SKX-NEXT: vpaddq %xmm0, 
%xmm1, %xmm0 1936; SKX-NEXT: kmovw %k0, %eax 1937; SKX-NEXT: testb $1, %al 1938; SKX-NEXT: jne .LBB23_1 1939; SKX-NEXT: # %bb.2: # %else 1940; SKX-NEXT: testb $2, %al 1941; SKX-NEXT: jne .LBB23_3 1942; SKX-NEXT: .LBB23_4: # %else2 1943; SKX-NEXT: vmovdqa %xmm2, %xmm0 1944; SKX-NEXT: retq 1945; SKX-NEXT: .LBB23_1: # %cond.load 1946; SKX-NEXT: vmovq %xmm0, %rcx 1947; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 1948; SKX-NEXT: testb $2, %al 1949; SKX-NEXT: je .LBB23_4 1950; SKX-NEXT: .LBB23_3: # %cond.load1 1951; SKX-NEXT: vpextrq $1, %xmm0, %rax 1952; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 1953; SKX-NEXT: vmovdqa %xmm2, %xmm0 1954; SKX-NEXT: retq 1955; 1956; SKX_32-LABEL: test23: 1957; SKX_32: # %bb.0: 1958; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 1959; SKX_32-NEXT: vpmovq2m %xmm1, %k0 1960; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 1961; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 1962; SKX_32-NEXT: kmovw %k0, %eax 1963; SKX_32-NEXT: testb $1, %al 1964; SKX_32-NEXT: jne .LBB23_1 1965; SKX_32-NEXT: # %bb.2: # %else 1966; SKX_32-NEXT: testb $2, %al 1967; SKX_32-NEXT: jne .LBB23_3 1968; SKX_32-NEXT: .LBB23_4: # %else2 1969; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 1970; SKX_32-NEXT: retl 1971; SKX_32-NEXT: .LBB23_1: # %cond.load 1972; SKX_32-NEXT: vmovd %xmm0, %ecx 1973; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 1974; SKX_32-NEXT: testb $2, %al 1975; SKX_32-NEXT: je .LBB23_4 1976; SKX_32-NEXT: .LBB23_3: # %cond.load1 1977; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 1978; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 1979; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 1980; SKX_32-NEXT: retl 1981 %sext_ind = sext <2 x i32> %ind to <2 x i64> 1982 %gep.random = getelementptr i32, ptr %base, <2 x i64> %sext_ind 1983 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) 1984 ret <2 x i32>%res 1985} 1986 1987define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %src0) { 1988; KNL_64-LABEL: test23b: 
1989; KNL_64: # %bb.0: 1990; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 1991; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 1992; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 1993; KNL_64-NEXT: vmovq %rdi, %xmm1 1994; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 1995; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 1996; KNL_64-NEXT: kmovw %k0, %eax 1997; KNL_64-NEXT: testb $1, %al 1998; KNL_64-NEXT: jne .LBB24_1 1999; KNL_64-NEXT: # %bb.2: # %else 2000; KNL_64-NEXT: testb $2, %al 2001; KNL_64-NEXT: jne .LBB24_3 2002; KNL_64-NEXT: .LBB24_4: # %else2 2003; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 2004; KNL_64-NEXT: vzeroupper 2005; KNL_64-NEXT: retq 2006; KNL_64-NEXT: .LBB24_1: # %cond.load 2007; KNL_64-NEXT: vmovq %xmm0, %rcx 2008; KNL_64-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 2009; KNL_64-NEXT: testb $2, %al 2010; KNL_64-NEXT: je .LBB24_4 2011; KNL_64-NEXT: .LBB24_3: # %cond.load1 2012; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 2013; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 2014; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 2015; KNL_64-NEXT: vzeroupper 2016; KNL_64-NEXT: retq 2017; 2018; KNL_32-LABEL: test23b: 2019; KNL_32: # %bb.0: 2020; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 2021; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 2022; KNL_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2023; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 2024; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 2025; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2026; KNL_32-NEXT: kmovw %k0, %eax 2027; KNL_32-NEXT: testb $1, %al 2028; KNL_32-NEXT: jne .LBB24_1 2029; KNL_32-NEXT: # %bb.2: # %else 2030; KNL_32-NEXT: testb $2, %al 2031; KNL_32-NEXT: jne .LBB24_3 2032; KNL_32-NEXT: .LBB24_4: # %else2 2033; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 2034; KNL_32-NEXT: vzeroupper 2035; KNL_32-NEXT: retl 2036; KNL_32-NEXT: .LBB24_1: # %cond.load 2037; KNL_32-NEXT: vmovd %xmm0, %ecx 2038; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 2039; KNL_32-NEXT: testb $2, %al 2040; KNL_32-NEXT: je .LBB24_4 2041; KNL_32-NEXT: .LBB24_3: # %cond.load1 2042; KNL_32-NEXT: vpextrd $1, 
%xmm0, %eax 2043; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 2044; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 2045; KNL_32-NEXT: vzeroupper 2046; KNL_32-NEXT: retl 2047; 2048; SKX-LABEL: test23b: 2049; SKX: # %bb.0: 2050; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 2051; SKX-NEXT: vpmovq2m %xmm1, %k0 2052; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 2053; SKX-NEXT: vpbroadcastq %rdi, %xmm1 2054; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2055; SKX-NEXT: kmovw %k0, %eax 2056; SKX-NEXT: testb $1, %al 2057; SKX-NEXT: jne .LBB24_1 2058; SKX-NEXT: # %bb.2: # %else 2059; SKX-NEXT: testb $2, %al 2060; SKX-NEXT: jne .LBB24_3 2061; SKX-NEXT: .LBB24_4: # %else2 2062; SKX-NEXT: vmovdqa %xmm2, %xmm0 2063; SKX-NEXT: retq 2064; SKX-NEXT: .LBB24_1: # %cond.load 2065; SKX-NEXT: vmovq %xmm0, %rcx 2066; SKX-NEXT: vpinsrd $0, (%rcx), %xmm2, %xmm2 2067; SKX-NEXT: testb $2, %al 2068; SKX-NEXT: je .LBB24_4 2069; SKX-NEXT: .LBB24_3: # %cond.load1 2070; SKX-NEXT: vpextrq $1, %xmm0, %rax 2071; SKX-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2 2072; SKX-NEXT: vmovdqa %xmm2, %xmm0 2073; SKX-NEXT: retq 2074; 2075; SKX_32-LABEL: test23b: 2076; SKX_32: # %bb.0: 2077; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 2078; SKX_32-NEXT: vpmovq2m %xmm1, %k0 2079; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2080; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 2081; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 2082; SKX_32-NEXT: kmovw %k0, %eax 2083; SKX_32-NEXT: testb $1, %al 2084; SKX_32-NEXT: jne .LBB24_1 2085; SKX_32-NEXT: # %bb.2: # %else 2086; SKX_32-NEXT: testb $2, %al 2087; SKX_32-NEXT: jne .LBB24_3 2088; SKX_32-NEXT: .LBB24_4: # %else2 2089; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 2090; SKX_32-NEXT: retl 2091; SKX_32-NEXT: .LBB24_1: # %cond.load 2092; SKX_32-NEXT: vmovd %xmm0, %ecx 2093; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm2 2094; SKX_32-NEXT: testb $2, %al 2095; SKX_32-NEXT: je .LBB24_4 2096; SKX_32-NEXT: .LBB24_3: # %cond.load1 2097; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 2098; SKX_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2 2099; 
SKX_32-NEXT: vmovdqa %xmm2, %xmm0 2100; SKX_32-NEXT: retl 2101 %gep.random = getelementptr i32, ptr %base, <2 x i64> %ind 2102 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0) 2103 ret <2 x i32>%res 2104} 2105 2106define <2 x i32> @test24(ptr %base, <2 x i32> %ind) { 2107; KNL_64-LABEL: test24: 2108; KNL_64: # %bb.0: 2109; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 2110; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 2111; KNL_64-NEXT: vmovq %rdi, %xmm1 2112; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 2113; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2114; KNL_64-NEXT: vmovq %xmm0, %rax 2115; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 2116; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2117; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 2118; KNL_64-NEXT: retq 2119; 2120; KNL_32-LABEL: test24: 2121; KNL_32: # %bb.0: 2122; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 2123; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 2124; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2125; KNL_32-NEXT: vmovd %xmm0, %eax 2126; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx 2127; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2128; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0 2129; KNL_32-NEXT: retl 2130; 2131; SKX-LABEL: test24: 2132; SKX: # %bb.0: 2133; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 2134; SKX-NEXT: vpbroadcastq %rdi, %xmm1 2135; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 2136; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2137; SKX-NEXT: vmovq %xmm0, %rax 2138; SKX-NEXT: vpextrq $1, %xmm0, %rcx 2139; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2140; SKX-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0 2141; SKX-NEXT: retq 2142; 2143; SKX_32-LABEL: test24: 2144; SKX_32: # %bb.0: 2145; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 2146; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 2147; SKX_32-NEXT: vmovd %xmm0, %eax 2148; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx 2149; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2150; SKX_32-NEXT: vpinsrd $1, 
(%ecx), %xmm0, %xmm0 2151; SKX_32-NEXT: retl 2152 %sext_ind = sext <2 x i32> %ind to <2 x i64> 2153 %gep.random = getelementptr i32, ptr %base, <2 x i64> %sext_ind 2154 %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) 2155 ret <2 x i32>%res 2156} 2157 2158define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %src0) { 2159; KNL_64-LABEL: test25: 2160; KNL_64: # %bb.0: 2161; KNL_64-NEXT: vpsllq $63, %xmm1, %xmm1 2162; KNL_64-NEXT: vptestmq %zmm1, %zmm1, %k0 2163; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 2164; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 2165; KNL_64-NEXT: vmovq %rdi, %xmm1 2166; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 2167; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2168; KNL_64-NEXT: kmovw %k0, %eax 2169; KNL_64-NEXT: testb $1, %al 2170; KNL_64-NEXT: jne .LBB26_1 2171; KNL_64-NEXT: # %bb.2: # %else 2172; KNL_64-NEXT: testb $2, %al 2173; KNL_64-NEXT: jne .LBB26_3 2174; KNL_64-NEXT: .LBB26_4: # %else2 2175; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 2176; KNL_64-NEXT: vzeroupper 2177; KNL_64-NEXT: retq 2178; KNL_64-NEXT: .LBB26_1: # %cond.load 2179; KNL_64-NEXT: vmovq %xmm0, %rcx 2180; KNL_64-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 2181; KNL_64-NEXT: testb $2, %al 2182; KNL_64-NEXT: je .LBB26_4 2183; KNL_64-NEXT: .LBB26_3: # %cond.load1 2184; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 2185; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2 2186; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 2187; KNL_64-NEXT: vzeroupper 2188; KNL_64-NEXT: retq 2189; 2190; KNL_32-LABEL: test25: 2191; KNL_32: # %bb.0: 2192; KNL_32-NEXT: vpsllq $63, %xmm1, %xmm1 2193; KNL_32-NEXT: vptestmq %zmm1, %zmm1, %k0 2194; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 2195; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 2196; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2197; KNL_32-NEXT: kmovw %k0, %eax 2198; KNL_32-NEXT: testb $1, %al 2199; KNL_32-NEXT: jne .LBB26_1 2200; KNL_32-NEXT: # %bb.2: # %else 2201; KNL_32-NEXT: testb $2, %al 
2202; KNL_32-NEXT: jne .LBB26_3 2203; KNL_32-NEXT: .LBB26_4: # %else2 2204; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 2205; KNL_32-NEXT: vzeroupper 2206; KNL_32-NEXT: retl 2207; KNL_32-NEXT: .LBB26_1: # %cond.load 2208; KNL_32-NEXT: vmovd %xmm0, %ecx 2209; KNL_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1 2210; KNL_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2 2211; KNL_32-NEXT: testb $2, %al 2212; KNL_32-NEXT: je .LBB26_4 2213; KNL_32-NEXT: .LBB26_3: # %cond.load1 2214; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 2215; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0 2216; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2 2217; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 2218; KNL_32-NEXT: vzeroupper 2219; KNL_32-NEXT: retl 2220; 2221; SKX-LABEL: test25: 2222; SKX: # %bb.0: 2223; SKX-NEXT: vpsllq $63, %xmm1, %xmm1 2224; SKX-NEXT: vpmovq2m %xmm1, %k0 2225; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 2226; SKX-NEXT: vpbroadcastq %rdi, %xmm1 2227; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 2228; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2229; SKX-NEXT: kmovw %k0, %eax 2230; SKX-NEXT: testb $1, %al 2231; SKX-NEXT: jne .LBB26_1 2232; SKX-NEXT: # %bb.2: # %else 2233; SKX-NEXT: testb $2, %al 2234; SKX-NEXT: jne .LBB26_3 2235; SKX-NEXT: .LBB26_4: # %else2 2236; SKX-NEXT: vmovdqa %xmm2, %xmm0 2237; SKX-NEXT: retq 2238; SKX-NEXT: .LBB26_1: # %cond.load 2239; SKX-NEXT: vmovq %xmm0, %rcx 2240; SKX-NEXT: vpinsrq $0, (%rcx), %xmm2, %xmm2 2241; SKX-NEXT: testb $2, %al 2242; SKX-NEXT: je .LBB26_4 2243; SKX-NEXT: .LBB26_3: # %cond.load1 2244; SKX-NEXT: vpextrq $1, %xmm0, %rax 2245; SKX-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2 2246; SKX-NEXT: vmovdqa %xmm2, %xmm0 2247; SKX-NEXT: retq 2248; 2249; SKX_32-LABEL: test25: 2250; SKX_32: # %bb.0: 2251; SKX_32-NEXT: vpsllq $63, %xmm1, %xmm1 2252; SKX_32-NEXT: vpmovq2m %xmm1, %k0 2253; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 2254; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 2255; SKX_32-NEXT: kmovw %k0, %eax 2256; SKX_32-NEXT: testb $1, %al 2257; SKX_32-NEXT: jne .LBB26_1 2258; SKX_32-NEXT: # 
%bb.2: # %else 2259; SKX_32-NEXT: testb $2, %al 2260; SKX_32-NEXT: jne .LBB26_3 2261; SKX_32-NEXT: .LBB26_4: # %else2 2262; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 2263; SKX_32-NEXT: retl 2264; SKX_32-NEXT: .LBB26_1: # %cond.load 2265; SKX_32-NEXT: vmovd %xmm0, %ecx 2266; SKX_32-NEXT: vpinsrd $0, (%ecx), %xmm2, %xmm1 2267; SKX_32-NEXT: vpinsrd $1, 4(%ecx), %xmm1, %xmm2 2268; SKX_32-NEXT: testb $2, %al 2269; SKX_32-NEXT: je .LBB26_4 2270; SKX_32-NEXT: .LBB26_3: # %cond.load1 2271; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 2272; SKX_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0 2273; SKX_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2 2274; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 2275; SKX_32-NEXT: retl 2276 %sext_ind = sext <2 x i32> %ind to <2 x i64> 2277 %gep.random = getelementptr i64, ptr %base, <2 x i64> %sext_ind 2278 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0) 2279 ret <2 x i64>%res 2280} 2281 2282define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) { 2283; KNL_64-LABEL: test26: 2284; KNL_64: # %bb.0: 2285; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 2286; KNL_64-NEXT: vpsllq $3, %xmm0, %xmm0 2287; KNL_64-NEXT: vmovq %rdi, %xmm1 2288; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 2289; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2290; KNL_64-NEXT: vmovq %xmm0, %rax 2291; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 2292; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2293; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2294; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2295; KNL_64-NEXT: retq 2296; 2297; KNL_32-LABEL: test26: 2298; KNL_32: # %bb.0: 2299; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0 2300; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 2301; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2302; KNL_32-NEXT: vmovd %xmm0, %eax 2303; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx 2304; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2305; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0 2306; KNL_32-NEXT: vpinsrd $2, 
(%ecx), %xmm0, %xmm0 2307; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0 2308; KNL_32-NEXT: retl 2309; 2310; SKX-LABEL: test26: 2311; SKX: # %bb.0: 2312; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 2313; SKX-NEXT: vpbroadcastq %rdi, %xmm1 2314; SKX-NEXT: vpsllq $3, %xmm0, %xmm0 2315; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2316; SKX-NEXT: vmovq %xmm0, %rax 2317; SKX-NEXT: vpextrq $1, %xmm0, %rcx 2318; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 2319; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 2320; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] 2321; SKX-NEXT: retq 2322; 2323; SKX_32-LABEL: test26: 2324; SKX_32: # %bb.0: 2325; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0 2326; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 2327; SKX_32-NEXT: vmovd %xmm0, %eax 2328; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx 2329; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2330; SKX_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0 2331; SKX_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0 2332; SKX_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0 2333; SKX_32-NEXT: retl 2334 %sext_ind = sext <2 x i32> %ind to <2 x i64> 2335 %gep.random = getelementptr i64, ptr %base, <2 x i64> %sext_ind 2336 %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0) 2337 ret <2 x i64>%res 2338} 2339 2340; Result type requires widening; all-ones mask 2341define <2 x float> @test27(ptr %base, <2 x i32> %ind) { 2342; KNL_64-LABEL: test27: 2343; KNL_64: # %bb.0: 2344; KNL_64-NEXT: vpmovsxdq %xmm0, %xmm0 2345; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 2346; KNL_64-NEXT: vmovq %rdi, %xmm1 2347; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1 2348; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2349; KNL_64-NEXT: vmovq %xmm0, %rax 2350; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 2351; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2352; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 2353; KNL_64-NEXT: retq 2354; 2355; KNL_32-LABEL: test27: 2356; 
KNL_32: # %bb.0: 2357; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 2358; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1 2359; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0 2360; KNL_32-NEXT: vmovd %xmm0, %eax 2361; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx 2362; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2363; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 2364; KNL_32-NEXT: retl 2365; 2366; SKX-LABEL: test27: 2367; SKX: # %bb.0: 2368; SKX-NEXT: vpmovsxdq %xmm0, %xmm0 2369; SKX-NEXT: vpbroadcastq %rdi, %xmm1 2370; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 2371; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 2372; SKX-NEXT: vmovq %xmm0, %rax 2373; SKX-NEXT: vpextrq $1, %xmm0, %rcx 2374; SKX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2375; SKX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 2376; SKX-NEXT: retq 2377; 2378; SKX_32-LABEL: test27: 2379; SKX_32: # %bb.0: 2380; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 2381; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 2382; SKX_32-NEXT: vmovd %xmm0, %eax 2383; SKX_32-NEXT: vpextrd $1, %xmm0, %ecx 2384; SKX_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 2385; SKX_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] 2386; SKX_32-NEXT: retl 2387 %sext_ind = sext <2 x i32> %ind to <2 x i64> 2388 %gep.random = getelementptr float, ptr %base, <2 x i64> %sext_ind 2389 %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef) 2390 ret <2 x float>%res 2391} 2392 2393; Data type requires promotion, mask is all-ones 2394define void @test28(<2 x i32>%a1, <2 x ptr> %ptr) { 2395; KNL_64-LABEL: test28: 2396; KNL_64: # %bb.0: 2397; KNL_64-NEXT: vmovq %xmm1, %rax 2398; KNL_64-NEXT: vmovss %xmm0, (%rax) 2399; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 2400; KNL_64-NEXT: vextractps $1, %xmm0, (%rax) 2401; KNL_64-NEXT: retq 2402; 2403; KNL_32-LABEL: test28: 2404; KNL_32: # %bb.0: 2405; KNL_32-NEXT: vmovd %xmm1, %eax 2406; KNL_32-NEXT: 
vmovss %xmm0, (%eax) 2407; KNL_32-NEXT: vpextrd $1, %xmm1, %eax 2408; KNL_32-NEXT: vextractps $1, %xmm0, (%eax) 2409; KNL_32-NEXT: retl 2410; 2411; SKX-LABEL: test28: 2412; SKX: # %bb.0: 2413; SKX-NEXT: vmovq %xmm1, %rax 2414; SKX-NEXT: vmovss %xmm0, (%rax) 2415; SKX-NEXT: vpextrq $1, %xmm1, %rax 2416; SKX-NEXT: vextractps $1, %xmm0, (%rax) 2417; SKX-NEXT: retq 2418; 2419; SKX_32-LABEL: test28: 2420; SKX_32: # %bb.0: 2421; SKX_32-NEXT: vmovd %xmm1, %eax 2422; SKX_32-NEXT: vmovss %xmm0, (%eax) 2423; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 2424; SKX_32-NEXT: vextractps $1, %xmm0, (%eax) 2425; SKX_32-NEXT: retl 2426 call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> <i1 true, i1 true>) 2427 ret void 2428} 2429 2430; SCALAR-LABEL: test29 2431; SCALAR: extractelement <16 x ptr> 2432; SCALAR-NEXT: load float 2433; SCALAR-NEXT: insertelement <16 x float> 2434; SCALAR-NEXT: extractelement <16 x ptr> 2435; SCALAR-NEXT: load float 2436 2437define <16 x float> @test29(ptr %base, <16 x i32> %ind) { 2438; KNL_64-LABEL: test29: 2439; KNL_64: # %bb.0: 2440; KNL_64-NEXT: vxorps %xmm1, %xmm1, %xmm1 2441; KNL_64-NEXT: movw $44, %ax 2442; KNL_64-NEXT: kmovw %eax, %k1 2443; KNL_64-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 2444; KNL_64-NEXT: vmovaps %zmm1, %zmm0 2445; KNL_64-NEXT: retq 2446; 2447; KNL_32-LABEL: test29: 2448; KNL_32: # %bb.0: 2449; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 2450; KNL_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 2451; KNL_32-NEXT: movw $44, %cx 2452; KNL_32-NEXT: kmovw %ecx, %k1 2453; KNL_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 2454; KNL_32-NEXT: vmovaps %zmm1, %zmm0 2455; KNL_32-NEXT: retl 2456; 2457; SKX-LABEL: test29: 2458; SKX: # %bb.0: 2459; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1 2460; SKX-NEXT: movw $44, %ax 2461; SKX-NEXT: kmovw %eax, %k1 2462; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} 2463; SKX-NEXT: vmovaps %zmm1, %zmm0 2464; SKX-NEXT: retq 2465; 2466; SKX_32-LABEL: test29: 2467; SKX_32: # %bb.0: 2468; 
SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 2469; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1 2470; SKX_32-NEXT: movw $44, %cx 2471; SKX_32-NEXT: kmovw %ecx, %k1 2472; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1} 2473; SKX_32-NEXT: vmovaps %zmm1, %zmm0 2474; SKX_32-NEXT: retl 2475 2476 %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0 2477 %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer 2478 2479 %sext_ind = sext <16 x i32> %ind to <16 x i64> 2480 %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind 2481 2482 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef) 2483 ret <16 x float>%res 2484} 2485 2486declare <3 x i32> @llvm.masked.gather.v3i32.v3p0(<3 x ptr>, i32, <3 x i1>, <3 x i32>) 2487define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { 2488; KNL_64-LABEL: test30: 2489; KNL_64: # %bb.0: 2490; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 2491; KNL_64-NEXT: andl $1, %edi 2492; KNL_64-NEXT: kmovw %edi, %k0 2493; KNL_64-NEXT: kmovw %esi, %k1 2494; KNL_64-NEXT: kshiftlw $15, %k1, %k1 2495; KNL_64-NEXT: kshiftrw $14, %k1, %k1 2496; KNL_64-NEXT: korw %k1, %k0, %k0 2497; KNL_64-NEXT: movw $-5, %ax 2498; KNL_64-NEXT: kmovw %eax, %k1 2499; KNL_64-NEXT: kandw %k1, %k0, %k0 2500; KNL_64-NEXT: kmovw %edx, %k1 2501; KNL_64-NEXT: kshiftlw $15, %k1, %k1 2502; KNL_64-NEXT: kshiftrw $13, %k1, %k1 2503; KNL_64-NEXT: korw %k1, %k0, %k0 2504; KNL_64-NEXT: movb $7, %al 2505; KNL_64-NEXT: kmovw %eax, %k1 2506; KNL_64-NEXT: kandw %k1, %k0, %k0 2507; KNL_64-NEXT: kshiftlw $12, %k0, %k0 2508; KNL_64-NEXT: kshiftrw $12, %k0, %k1 2509; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 2510; KNL_64-NEXT: vpsllq $2, 
%ymm1, %ymm1 2511; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 2512; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1} 2513; KNL_64-NEXT: vmovdqa %xmm2, %xmm0 2514; KNL_64-NEXT: vzeroupper 2515; KNL_64-NEXT: retq 2516; 2517; KNL_32-LABEL: test30: 2518; KNL_32: # %bb.0: 2519; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 2520; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2521; KNL_32-NEXT: andl $1, %eax 2522; KNL_32-NEXT: kmovw %eax, %k0 2523; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2524; KNL_32-NEXT: kmovw %eax, %k1 2525; KNL_32-NEXT: kshiftlw $15, %k1, %k1 2526; KNL_32-NEXT: kshiftrw $14, %k1, %k1 2527; KNL_32-NEXT: korw %k1, %k0, %k0 2528; KNL_32-NEXT: movw $-5, %ax 2529; KNL_32-NEXT: kmovw %eax, %k1 2530; KNL_32-NEXT: kandw %k1, %k0, %k0 2531; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2532; KNL_32-NEXT: kmovw %eax, %k1 2533; KNL_32-NEXT: kshiftlw $15, %k1, %k1 2534; KNL_32-NEXT: kshiftrw $13, %k1, %k1 2535; KNL_32-NEXT: korw %k1, %k0, %k0 2536; KNL_32-NEXT: movb $7, %al 2537; KNL_32-NEXT: kmovw %eax, %k1 2538; KNL_32-NEXT: kandw %k1, %k0, %k0 2539; KNL_32-NEXT: kshiftlw $12, %k0, %k0 2540; KNL_32-NEXT: kshiftrw $12, %k0, %k1 2541; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 2542; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2543; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 2544; KNL_32-NEXT: vmovdqa %xmm2, %xmm0 2545; KNL_32-NEXT: vzeroupper 2546; KNL_32-NEXT: retl 2547; 2548; SKX-LABEL: test30: 2549; SKX: # %bb.0: 2550; SKX-NEXT: kmovw %esi, %k0 2551; SKX-NEXT: kshiftlb $7, %k0, %k0 2552; SKX-NEXT: kshiftrb $6, %k0, %k0 2553; SKX-NEXT: kmovw %edi, %k1 2554; SKX-NEXT: kshiftlb $7, %k1, %k1 2555; SKX-NEXT: kshiftrb $7, %k1, %k1 2556; SKX-NEXT: korw %k0, %k1, %k0 2557; SKX-NEXT: movb $-5, %al 2558; SKX-NEXT: kmovw %eax, %k1 2559; SKX-NEXT: kandw %k1, %k0, %k0 2560; SKX-NEXT: kmovw %edx, %k1 2561; SKX-NEXT: kshiftlb $7, %k1, %k1 2562; SKX-NEXT: kshiftrb $5, %k1, %k1 2563; SKX-NEXT: korw %k1, %k0, %k0 2564; SKX-NEXT: movb $7, %al 2565; SKX-NEXT: kmovw %eax, %k1 2566; 
SKX-NEXT: kandw %k1, %k0, %k1 2567; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 2568; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 2569; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 2570; SKX-NEXT: vpgatherqd (,%ymm0), %xmm2 {%k1} 2571; SKX-NEXT: vmovdqa %xmm2, %xmm0 2572; SKX-NEXT: vzeroupper 2573; SKX-NEXT: retq 2574; 2575; SKX_32-LABEL: test30: 2576; SKX_32: # %bb.0: 2577; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2578; SKX_32-NEXT: kmovw %eax, %k0 2579; SKX_32-NEXT: kshiftlb $7, %k0, %k0 2580; SKX_32-NEXT: kshiftrb $6, %k0, %k0 2581; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2582; SKX_32-NEXT: kmovw %eax, %k1 2583; SKX_32-NEXT: kshiftlb $7, %k1, %k1 2584; SKX_32-NEXT: kshiftrb $7, %k1, %k1 2585; SKX_32-NEXT: korw %k0, %k1, %k0 2586; SKX_32-NEXT: movb $-5, %al 2587; SKX_32-NEXT: kmovw %eax, %k1 2588; SKX_32-NEXT: kandw %k1, %k0, %k0 2589; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2590; SKX_32-NEXT: kmovw %eax, %k1 2591; SKX_32-NEXT: kshiftlb $7, %k1, %k1 2592; SKX_32-NEXT: kshiftrb $5, %k1, %k1 2593; SKX_32-NEXT: korw %k1, %k0, %k0 2594; SKX_32-NEXT: movb $7, %al 2595; SKX_32-NEXT: kmovw %eax, %k1 2596; SKX_32-NEXT: kandw %k1, %k0, %k1 2597; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 2598; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2599; SKX_32-NEXT: vpgatherdd (,%xmm0), %xmm2 {%k1} 2600; SKX_32-NEXT: vmovdqa %xmm2, %xmm0 2601; SKX_32-NEXT: retl 2602 2603 %sext_ind = sext <3 x i32> %ind to <3 x i64> 2604 %gep.random = getelementptr i32, <3 x ptr> %base, <3 x i64> %sext_ind 2605 %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0(<3 x ptr> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0) 2606 ret <3 x i32>%res 2607} 2608 2609; Non-power of 2 scatter 2610declare void @llvm.masked.scatter.v3i32.v3p0(<3 x i32>, <3 x ptr>, i32, <3 x i1>) 2611define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) { 2612; KNL_64-LABEL: test30b: 2613; KNL_64: # %bb.0: 2614; KNL_64-NEXT: # kill: def $xmm2 killed $xmm2 def $ymm2 2615; KNL_64-NEXT: andl $1, %edi 2616; KNL_64-NEXT: kmovw 
%edi, %k0 2617; KNL_64-NEXT: kmovw %esi, %k1 2618; KNL_64-NEXT: kshiftlw $15, %k1, %k1 2619; KNL_64-NEXT: kshiftrw $14, %k1, %k1 2620; KNL_64-NEXT: korw %k1, %k0, %k0 2621; KNL_64-NEXT: movw $-5, %ax 2622; KNL_64-NEXT: kmovw %eax, %k1 2623; KNL_64-NEXT: kandw %k1, %k0, %k0 2624; KNL_64-NEXT: kmovw %edx, %k1 2625; KNL_64-NEXT: kshiftlw $15, %k1, %k1 2626; KNL_64-NEXT: kshiftrw $13, %k1, %k1 2627; KNL_64-NEXT: korw %k1, %k0, %k0 2628; KNL_64-NEXT: movb $7, %al 2629; KNL_64-NEXT: kmovw %eax, %k1 2630; KNL_64-NEXT: kandw %k1, %k0, %k0 2631; KNL_64-NEXT: kshiftlw $12, %k0, %k0 2632; KNL_64-NEXT: kshiftrw $12, %k0, %k1 2633; KNL_64-NEXT: vpmovsxdq %xmm1, %ymm1 2634; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1 2635; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0 2636; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1} 2637; KNL_64-NEXT: vzeroupper 2638; KNL_64-NEXT: retq 2639; 2640; KNL_32-LABEL: test30b: 2641; KNL_32: # %bb.0: 2642; KNL_32-NEXT: # kill: def $xmm2 killed $xmm2 def $zmm2 2643; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2644; KNL_32-NEXT: andl $1, %eax 2645; KNL_32-NEXT: kmovw %eax, %k0 2646; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2647; KNL_32-NEXT: kmovw %eax, %k1 2648; KNL_32-NEXT: kshiftlw $15, %k1, %k1 2649; KNL_32-NEXT: kshiftrw $14, %k1, %k1 2650; KNL_32-NEXT: korw %k1, %k0, %k0 2651; KNL_32-NEXT: movw $-5, %ax 2652; KNL_32-NEXT: kmovw %eax, %k1 2653; KNL_32-NEXT: kandw %k1, %k0, %k0 2654; KNL_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2655; KNL_32-NEXT: kmovw %eax, %k1 2656; KNL_32-NEXT: kshiftlw $15, %k1, %k1 2657; KNL_32-NEXT: kshiftrw $13, %k1, %k1 2658; KNL_32-NEXT: korw %k1, %k0, %k0 2659; KNL_32-NEXT: movb $7, %al 2660; KNL_32-NEXT: kmovw %eax, %k1 2661; KNL_32-NEXT: kandw %k1, %k0, %k0 2662; KNL_32-NEXT: kshiftlw $12, %k0, %k0 2663; KNL_32-NEXT: kshiftrw $12, %k0, %k1 2664; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1 2665; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2666; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} 2667; KNL_32-NEXT: vzeroupper 2668; KNL_32-NEXT: retl 
2669; 2670; SKX-LABEL: test30b: 2671; SKX: # %bb.0: 2672; SKX-NEXT: kmovw %esi, %k0 2673; SKX-NEXT: kshiftlb $7, %k0, %k0 2674; SKX-NEXT: kshiftrb $6, %k0, %k0 2675; SKX-NEXT: kmovw %edi, %k1 2676; SKX-NEXT: kshiftlb $7, %k1, %k1 2677; SKX-NEXT: kshiftrb $7, %k1, %k1 2678; SKX-NEXT: korw %k0, %k1, %k0 2679; SKX-NEXT: movb $-5, %al 2680; SKX-NEXT: kmovw %eax, %k1 2681; SKX-NEXT: kandw %k1, %k0, %k0 2682; SKX-NEXT: kmovw %edx, %k1 2683; SKX-NEXT: kshiftlb $7, %k1, %k1 2684; SKX-NEXT: kshiftrb $5, %k1, %k1 2685; SKX-NEXT: korw %k1, %k0, %k0 2686; SKX-NEXT: movb $7, %al 2687; SKX-NEXT: kmovw %eax, %k1 2688; SKX-NEXT: kandw %k1, %k0, %k1 2689; SKX-NEXT: vpmovsxdq %xmm1, %ymm1 2690; SKX-NEXT: vpsllq $2, %ymm1, %ymm1 2691; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 2692; SKX-NEXT: vpscatterqd %xmm2, (,%ymm0) {%k1} 2693; SKX-NEXT: vzeroupper 2694; SKX-NEXT: retq 2695; 2696; SKX_32-LABEL: test30b: 2697; SKX_32: # %bb.0: 2698; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2699; SKX_32-NEXT: kmovw %eax, %k0 2700; SKX_32-NEXT: kshiftlb $7, %k0, %k0 2701; SKX_32-NEXT: kshiftrb $6, %k0, %k0 2702; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2703; SKX_32-NEXT: kmovw %eax, %k1 2704; SKX_32-NEXT: kshiftlb $7, %k1, %k1 2705; SKX_32-NEXT: kshiftrb $7, %k1, %k1 2706; SKX_32-NEXT: korw %k0, %k1, %k0 2707; SKX_32-NEXT: movb $-5, %al 2708; SKX_32-NEXT: kmovw %eax, %k1 2709; SKX_32-NEXT: kandw %k1, %k0, %k0 2710; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax 2711; SKX_32-NEXT: kmovw %eax, %k1 2712; SKX_32-NEXT: kshiftlb $7, %k1, %k1 2713; SKX_32-NEXT: kshiftrb $5, %k1, %k1 2714; SKX_32-NEXT: korw %k1, %k0, %k0 2715; SKX_32-NEXT: movb $7, %al 2716; SKX_32-NEXT: kmovw %eax, %k1 2717; SKX_32-NEXT: kandw %k1, %k0, %k1 2718; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1 2719; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0 2720; SKX_32-NEXT: vpscatterdd %xmm2, (,%xmm0) {%k1} 2721; SKX_32-NEXT: retl 2722 %sext_ind = sext <3 x i32> %ind to <3 x i64> 2723 %gep.random = getelementptr i32, <3 x ptr> %base, <3 x i64> %sext_ind 
2724 call void @llvm.masked.scatter.v3i32.v3p0(<3 x i32> %src0, <3 x ptr> %gep.random, i32 4, <3 x i1> %mask) 2725 ret void 2726} 2727 2728declare <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr>, i32, <16 x i1>, <16 x ptr>) 2729define <16 x ptr> @test31(<16 x ptr> %ptrs) { 2730; KNL_64-LABEL: test31: 2731; KNL_64: # %bb.0: 2732; KNL_64-NEXT: kxnorw %k0, %k0, %k1 2733; KNL_64-NEXT: vpxor %xmm2, %xmm2, %xmm2 2734; KNL_64-NEXT: vpxor %xmm3, %xmm3, %xmm3 2735; KNL_64-NEXT: kxnorw %k0, %k0, %k2 2736; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} 2737; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} 2738; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0 2739; KNL_64-NEXT: vmovdqa64 %zmm2, %zmm1 2740; KNL_64-NEXT: retq 2741; 2742; KNL_32-LABEL: test31: 2743; KNL_32: # %bb.0: 2744; KNL_32-NEXT: kxnorw %k0, %k0, %k1 2745; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 2746; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 2747; KNL_32-NEXT: vmovdqa64 %zmm1, %zmm0 2748; KNL_32-NEXT: retl 2749; 2750; SKX-LABEL: test31: 2751; SKX: # %bb.0: 2752; SKX-NEXT: kxnorw %k0, %k0, %k1 2753; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2 2754; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3 2755; SKX-NEXT: kxnorw %k0, %k0, %k2 2756; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2} 2757; SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1} 2758; SKX-NEXT: vmovdqa64 %zmm3, %zmm0 2759; SKX-NEXT: vmovdqa64 %zmm2, %zmm1 2760; SKX-NEXT: retq 2761; 2762; SKX_32-LABEL: test31: 2763; SKX_32: # %bb.0: 2764; SKX_32-NEXT: kxnorw %k0, %k0, %k1 2765; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 2766; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k1} 2767; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm0 2768; SKX_32-NEXT: retl 2769 2770 %res = call <16 x ptr> @llvm.masked.gather.v16p0.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x ptr> undef) 2771 ret <16 x ptr>%res 2772} 2773 2774define <16 x i32> @test_gather_16i32(<16 x ptr> %ptrs, 
<16 x i1> %mask, <16 x i32> %src0) { 2775; KNL_64-LABEL: test_gather_16i32: 2776; KNL_64: # %bb.0: 2777; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 2778; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 2779; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 2780; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm2 2781; KNL_64-NEXT: kshiftrw $8, %k1, %k2 2782; KNL_64-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 2783; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 2784; KNL_64-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 2785; KNL_64-NEXT: retq 2786; 2787; KNL_32-LABEL: test_gather_16i32: 2788; KNL_32: # %bb.0: 2789; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 2790; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 2791; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2792; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 2793; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 2794; KNL_32-NEXT: retl 2795; 2796; SKX-LABEL: test_gather_16i32: 2797; SKX: # %bb.0: 2798; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 2799; SKX-NEXT: vpslld $31, %zmm2, %zmm2 2800; SKX-NEXT: vpmovd2m %zmm2, %k1 2801; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2 2802; SKX-NEXT: kshiftrw $8, %k1, %k2 2803; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2} 2804; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1} 2805; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0 2806; SKX-NEXT: retq 2807; 2808; SKX_32-LABEL: test_gather_16i32: 2809; SKX_32: # %bb.0: 2810; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 2811; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 2812; SKX_32-NEXT: vpmovd2m %zmm1, %k1 2813; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1} 2814; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 2815; SKX_32-NEXT: retl 2816 %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0) 2817 ret <16 x i32> %res 2818} 2819define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { 2820; KNL_64-LABEL: test_gather_16i64: 2821; KNL_64: # %bb.0: 2822; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 2823; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 2824; KNL_64-NEXT: vptestmd 
%zmm2, %zmm2, %k1 2825; KNL_64-NEXT: kshiftrw $8, %k1, %k2 2826; KNL_64-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 2827; KNL_64-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 2828; KNL_64-NEXT: vmovdqa64 %zmm3, %zmm0 2829; KNL_64-NEXT: vmovdqa64 %zmm4, %zmm1 2830; KNL_64-NEXT: retq 2831; 2832; KNL_32-LABEL: test_gather_16i64: 2833; KNL_32: # %bb.0: 2834; KNL_32-NEXT: pushl %ebp 2835; KNL_32-NEXT: .cfi_def_cfa_offset 8 2836; KNL_32-NEXT: .cfi_offset %ebp, -8 2837; KNL_32-NEXT: movl %esp, %ebp 2838; KNL_32-NEXT: .cfi_def_cfa_register %ebp 2839; KNL_32-NEXT: andl $-64, %esp 2840; KNL_32-NEXT: subl $64, %esp 2841; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 2842; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 2843; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2844; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 2845; KNL_32-NEXT: kshiftrw $8, %k1, %k2 2846; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} 2847; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2848; KNL_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} 2849; KNL_32-NEXT: vmovdqa64 %zmm2, %zmm0 2850; KNL_32-NEXT: movl %ebp, %esp 2851; KNL_32-NEXT: popl %ebp 2852; KNL_32-NEXT: .cfi_def_cfa %esp, 4 2853; KNL_32-NEXT: retl 2854; 2855; SKX-LABEL: test_gather_16i64: 2856; SKX: # %bb.0: 2857; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 2858; SKX-NEXT: vpslld $31, %zmm2, %zmm2 2859; SKX-NEXT: vpmovd2m %zmm2, %k1 2860; SKX-NEXT: kshiftrw $8, %k1, %k2 2861; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1} 2862; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2} 2863; SKX-NEXT: vmovdqa64 %zmm3, %zmm0 2864; SKX-NEXT: vmovdqa64 %zmm4, %zmm1 2865; SKX-NEXT: retq 2866; 2867; SKX_32-LABEL: test_gather_16i64: 2868; SKX_32: # %bb.0: 2869; SKX_32-NEXT: pushl %ebp 2870; SKX_32-NEXT: .cfi_def_cfa_offset 8 2871; SKX_32-NEXT: .cfi_offset %ebp, -8 2872; SKX_32-NEXT: movl %esp, %ebp 2873; SKX_32-NEXT: .cfi_def_cfa_register %ebp 2874; SKX_32-NEXT: andl $-64, %esp 2875; SKX_32-NEXT: subl $64, %esp 2876; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 2877; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 2878; SKX_32-NEXT: vpmovd2m 
%zmm1, %k1 2879; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 2880; SKX_32-NEXT: kshiftrw $8, %k1, %k2 2881; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1} 2882; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 2883; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2} 2884; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0 2885; SKX_32-NEXT: movl %ebp, %esp 2886; SKX_32-NEXT: popl %ebp 2887; SKX_32-NEXT: .cfi_def_cfa %esp, 4 2888; SKX_32-NEXT: retl 2889 %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0) 2890 ret <16 x i64> %res 2891} 2892declare <16 x i64> @llvm.masked.gather.v16i64.v16p0(<16 x ptr> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0) 2893define <16 x float> @test_gather_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float> %src0) { 2894; KNL_64-LABEL: test_gather_16f32: 2895; KNL_64: # %bb.0: 2896; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 2897; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 2898; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 2899; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm2 2900; KNL_64-NEXT: kshiftrw $8, %k1, %k2 2901; KNL_64-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} 2902; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} 2903; KNL_64-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 2904; KNL_64-NEXT: retq 2905; 2906; KNL_32-LABEL: test_gather_16f32: 2907; KNL_32: # %bb.0: 2908; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 2909; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 2910; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2911; KNL_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} 2912; KNL_32-NEXT: vmovaps %zmm2, %zmm0 2913; KNL_32-NEXT: retl 2914; 2915; SKX-LABEL: test_gather_16f32: 2916; SKX: # %bb.0: 2917; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 2918; SKX-NEXT: vpslld $31, %zmm2, %zmm2 2919; SKX-NEXT: vpmovd2m %zmm2, %k1 2920; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2 2921; SKX-NEXT: kshiftrw $8, %k1, %k2 2922; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2} 2923; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1} 2924; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0 2925; 
SKX-NEXT: retq 2926; 2927; SKX_32-LABEL: test_gather_16f32: 2928; SKX_32: # %bb.0: 2929; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 2930; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 2931; SKX_32-NEXT: vpmovd2m %zmm1, %k1 2932; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1} 2933; SKX_32-NEXT: vmovaps %zmm2, %zmm0 2934; SKX_32-NEXT: retl 2935 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0) 2936 ret <16 x float> %res 2937} 2938define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double> %src0) { 2939; KNL_64-LABEL: test_gather_16f64: 2940; KNL_64: # %bb.0: 2941; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 2942; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 2943; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 2944; KNL_64-NEXT: kshiftrw $8, %k1, %k2 2945; KNL_64-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} 2946; KNL_64-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} 2947; KNL_64-NEXT: vmovapd %zmm3, %zmm0 2948; KNL_64-NEXT: vmovapd %zmm4, %zmm1 2949; KNL_64-NEXT: retq 2950; 2951; KNL_32-LABEL: test_gather_16f64: 2952; KNL_32: # %bb.0: 2953; KNL_32-NEXT: pushl %ebp 2954; KNL_32-NEXT: .cfi_def_cfa_offset 8 2955; KNL_32-NEXT: .cfi_offset %ebp, -8 2956; KNL_32-NEXT: movl %esp, %ebp 2957; KNL_32-NEXT: .cfi_def_cfa_register %ebp 2958; KNL_32-NEXT: andl $-64, %esp 2959; KNL_32-NEXT: subl $64, %esp 2960; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 2961; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 2962; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 2963; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 2964; KNL_32-NEXT: kshiftrw $8, %k1, %k2 2965; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} 2966; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 2967; KNL_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} 2968; KNL_32-NEXT: vmovapd %zmm2, %zmm0 2969; KNL_32-NEXT: movl %ebp, %esp 2970; KNL_32-NEXT: popl %ebp 2971; KNL_32-NEXT: .cfi_def_cfa %esp, 4 2972; KNL_32-NEXT: retl 2973; 2974; SKX-LABEL: test_gather_16f64: 2975; SKX: # %bb.0: 2976; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 2977; 
SKX-NEXT: vpslld $31, %zmm2, %zmm2 2978; SKX-NEXT: vpmovd2m %zmm2, %k1 2979; SKX-NEXT: kshiftrw $8, %k1, %k2 2980; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1} 2981; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2} 2982; SKX-NEXT: vmovapd %zmm3, %zmm0 2983; SKX-NEXT: vmovapd %zmm4, %zmm1 2984; SKX-NEXT: retq 2985; 2986; SKX_32-LABEL: test_gather_16f64: 2987; SKX_32: # %bb.0: 2988; SKX_32-NEXT: pushl %ebp 2989; SKX_32-NEXT: .cfi_def_cfa_offset 8 2990; SKX_32-NEXT: .cfi_offset %ebp, -8 2991; SKX_32-NEXT: movl %esp, %ebp 2992; SKX_32-NEXT: .cfi_def_cfa_register %ebp 2993; SKX_32-NEXT: andl $-64, %esp 2994; SKX_32-NEXT: subl $64, %esp 2995; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 2996; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 2997; SKX_32-NEXT: vpmovd2m %zmm1, %k1 2998; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 2999; SKX_32-NEXT: kshiftrw $8, %k1, %k2 3000; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1} 3001; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 3002; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2} 3003; SKX_32-NEXT: vmovapd %zmm2, %zmm0 3004; SKX_32-NEXT: movl %ebp, %esp 3005; SKX_32-NEXT: popl %ebp 3006; SKX_32-NEXT: .cfi_def_cfa %esp, 4 3007; SKX_32-NEXT: retl 3008 %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0) 3009 ret <16 x double> %res 3010} 3011declare <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0) 3012define void @test_scatter_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i32> %src0) { 3013; KNL_64-LABEL: test_scatter_16i32: 3014; KNL_64: # %bb.0: 3015; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 3016; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 3017; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 3018; KNL_64-NEXT: kshiftrw $8, %k1, %k2 3019; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} 3020; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0 3021; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} 3022; KNL_64-NEXT: vzeroupper 3023; KNL_64-NEXT: retq 3024; 3025; 
KNL_32-LABEL: test_scatter_16i32: 3026; KNL_32: # %bb.0: 3027; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 3028; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 3029; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 3030; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} 3031; KNL_32-NEXT: vzeroupper 3032; KNL_32-NEXT: retl 3033; 3034; SKX-LABEL: test_scatter_16i32: 3035; SKX: # %bb.0: 3036; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 3037; SKX-NEXT: vpslld $31, %zmm2, %zmm2 3038; SKX-NEXT: vpmovd2m %zmm2, %k1 3039; SKX-NEXT: kshiftrw $8, %k1, %k2 3040; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1} 3041; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0 3042; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2} 3043; SKX-NEXT: vzeroupper 3044; SKX-NEXT: retq 3045; 3046; SKX_32-LABEL: test_scatter_16i32: 3047; SKX_32: # %bb.0: 3048; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 3049; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 3050; SKX_32-NEXT: vpmovd2m %zmm1, %k1 3051; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1} 3052; SKX_32-NEXT: vzeroupper 3053; SKX_32-NEXT: retl 3054 call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) 3055 ret void 3056} 3057define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %src0) { 3058; KNL_64-LABEL: test_scatter_16i64: 3059; KNL_64: # %bb.0: 3060; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 3061; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 3062; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 3063; KNL_64-NEXT: kshiftrw $8, %k1, %k2 3064; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} 3065; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} 3066; KNL_64-NEXT: vzeroupper 3067; KNL_64-NEXT: retq 3068; 3069; KNL_32-LABEL: test_scatter_16i64: 3070; KNL_32: # %bb.0: 3071; KNL_32-NEXT: pushl %ebp 3072; KNL_32-NEXT: .cfi_def_cfa_offset 8 3073; KNL_32-NEXT: .cfi_offset %ebp, -8 3074; KNL_32-NEXT: movl %esp, %ebp 3075; KNL_32-NEXT: .cfi_def_cfa_register %ebp 3076; KNL_32-NEXT: andl $-64, %esp 3077; KNL_32-NEXT: subl $64, %esp 3078; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 
3079; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 3080; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 3081; KNL_32-NEXT: vmovdqa64 8(%ebp), %zmm1 3082; KNL_32-NEXT: kshiftrw $8, %k1, %k2 3083; KNL_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} 3084; KNL_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 3085; KNL_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} 3086; KNL_32-NEXT: movl %ebp, %esp 3087; KNL_32-NEXT: popl %ebp 3088; KNL_32-NEXT: .cfi_def_cfa %esp, 4 3089; KNL_32-NEXT: vzeroupper 3090; KNL_32-NEXT: retl 3091; 3092; SKX-LABEL: test_scatter_16i64: 3093; SKX: # %bb.0: 3094; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 3095; SKX-NEXT: vpslld $31, %zmm2, %zmm2 3096; SKX-NEXT: vpmovd2m %zmm2, %k1 3097; SKX-NEXT: kshiftrw $8, %k1, %k2 3098; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1} 3099; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2} 3100; SKX-NEXT: vzeroupper 3101; SKX-NEXT: retq 3102; 3103; SKX_32-LABEL: test_scatter_16i64: 3104; SKX_32: # %bb.0: 3105; SKX_32-NEXT: pushl %ebp 3106; SKX_32-NEXT: .cfi_def_cfa_offset 8 3107; SKX_32-NEXT: .cfi_offset %ebp, -8 3108; SKX_32-NEXT: movl %esp, %ebp 3109; SKX_32-NEXT: .cfi_def_cfa_register %ebp 3110; SKX_32-NEXT: andl $-64, %esp 3111; SKX_32-NEXT: subl $64, %esp 3112; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 3113; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 3114; SKX_32-NEXT: vpmovd2m %zmm1, %k1 3115; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1 3116; SKX_32-NEXT: kshiftrw $8, %k1, %k2 3117; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1} 3118; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0 3119; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2} 3120; SKX_32-NEXT: movl %ebp, %esp 3121; SKX_32-NEXT: popl %ebp 3122; SKX_32-NEXT: .cfi_def_cfa %esp, 4 3123; SKX_32-NEXT: vzeroupper 3124; SKX_32-NEXT: retl 3125 call void @llvm.masked.scatter.v16i64.v16p0(<16 x i64> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) 3126 ret void 3127} 3128declare void @llvm.masked.scatter.v16i64.v16p0(<16 x i64> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask) 3129define void @test_scatter_16f32(<16 x ptr> %ptrs, 
<16 x i1> %mask, <16 x float> %src0) { 3130; KNL_64-LABEL: test_scatter_16f32: 3131; KNL_64: # %bb.0: 3132; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 3133; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 3134; KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 3135; KNL_64-NEXT: kshiftrw $8, %k1, %k2 3136; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} 3137; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0 3138; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 3139; KNL_64-NEXT: vzeroupper 3140; KNL_64-NEXT: retq 3141; 3142; KNL_32-LABEL: test_scatter_16f32: 3143; KNL_32: # %bb.0: 3144; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 3145; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 3146; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 3147; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 3148; KNL_32-NEXT: vzeroupper 3149; KNL_32-NEXT: retl 3150; 3151; SKX-LABEL: test_scatter_16f32: 3152; SKX: # %bb.0: 3153; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 3154; SKX-NEXT: vpslld $31, %zmm2, %zmm2 3155; SKX-NEXT: vpmovd2m %zmm2, %k1 3156; SKX-NEXT: kshiftrw $8, %k1, %k2 3157; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1} 3158; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0 3159; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2} 3160; SKX-NEXT: vzeroupper 3161; SKX-NEXT: retq 3162; 3163; SKX_32-LABEL: test_scatter_16f32: 3164; SKX_32: # %bb.0: 3165; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 3166; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 3167; SKX_32-NEXT: vpmovd2m %zmm1, %k1 3168; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1} 3169; SKX_32-NEXT: vzeroupper 3170; SKX_32-NEXT: retl 3171 call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) 3172 ret void 3173} 3174declare void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask) 3175define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double> %src0) { 3176; KNL_64-LABEL: test_scatter_16f64: 3177; KNL_64: # %bb.0: 3178; KNL_64-NEXT: vpmovsxbd %xmm2, %zmm2 3179; KNL_64-NEXT: vpslld $31, %zmm2, %zmm2 3180; 
KNL_64-NEXT: vptestmd %zmm2, %zmm2, %k1 3181; KNL_64-NEXT: kshiftrw $8, %k1, %k2 3182; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 3183; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 3184; KNL_64-NEXT: vzeroupper 3185; KNL_64-NEXT: retq 3186; 3187; KNL_32-LABEL: test_scatter_16f64: 3188; KNL_32: # %bb.0: 3189; KNL_32-NEXT: pushl %ebp 3190; KNL_32-NEXT: .cfi_def_cfa_offset 8 3191; KNL_32-NEXT: .cfi_offset %ebp, -8 3192; KNL_32-NEXT: movl %esp, %ebp 3193; KNL_32-NEXT: .cfi_def_cfa_register %ebp 3194; KNL_32-NEXT: andl $-64, %esp 3195; KNL_32-NEXT: subl $64, %esp 3196; KNL_32-NEXT: vpmovsxbd %xmm1, %zmm1 3197; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1 3198; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1 3199; KNL_32-NEXT: vmovapd 8(%ebp), %zmm1 3200; KNL_32-NEXT: kshiftrw $8, %k1, %k2 3201; KNL_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} 3202; KNL_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 3203; KNL_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} 3204; KNL_32-NEXT: movl %ebp, %esp 3205; KNL_32-NEXT: popl %ebp 3206; KNL_32-NEXT: .cfi_def_cfa %esp, 4 3207; KNL_32-NEXT: vzeroupper 3208; KNL_32-NEXT: retl 3209; 3210; SKX-LABEL: test_scatter_16f64: 3211; SKX: # %bb.0: 3212; SKX-NEXT: vpmovsxbd %xmm2, %zmm2 3213; SKX-NEXT: vpslld $31, %zmm2, %zmm2 3214; SKX-NEXT: vpmovd2m %zmm2, %k1 3215; SKX-NEXT: kshiftrw $8, %k1, %k2 3216; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1} 3217; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2} 3218; SKX-NEXT: vzeroupper 3219; SKX-NEXT: retq 3220; 3221; SKX_32-LABEL: test_scatter_16f64: 3222; SKX_32: # %bb.0: 3223; SKX_32-NEXT: pushl %ebp 3224; SKX_32-NEXT: .cfi_def_cfa_offset 8 3225; SKX_32-NEXT: .cfi_offset %ebp, -8 3226; SKX_32-NEXT: movl %esp, %ebp 3227; SKX_32-NEXT: .cfi_def_cfa_register %ebp 3228; SKX_32-NEXT: andl $-64, %esp 3229; SKX_32-NEXT: subl $64, %esp 3230; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1 3231; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1 3232; SKX_32-NEXT: vpmovd2m %zmm1, %k1 3233; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1 3234; SKX_32-NEXT: kshiftrw $8, %k1, 
%k2 3235; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1} 3236; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0 3237; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2} 3238; SKX_32-NEXT: movl %ebp, %esp 3239; SKX_32-NEXT: popl %ebp 3240; SKX_32-NEXT: .cfi_def_cfa %esp, 4 3241; SKX_32-NEXT: vzeroupper 3242; SKX_32-NEXT: retl 3243 call void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %ptrs, i32 4, <16 x i1> %mask) 3244 ret void 3245} 3246declare void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %ptrs, i32, <16 x i1> %mask) 3247 3248define <4 x i64> @test_pr28312(<4 x ptr> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) { 3249; KNL_64-LABEL: test_pr28312: 3250; KNL_64: # %bb.0: 3251; KNL_64-NEXT: vpslld $31, %xmm1, %xmm1 3252; KNL_64-NEXT: vptestmd %zmm1, %zmm1, %k0 3253; KNL_64-NEXT: kmovw %k0, %eax 3254; KNL_64-NEXT: testb $1, %al 3255; KNL_64-NEXT: # implicit-def: $ymm1 3256; KNL_64-NEXT: je .LBB42_2 3257; KNL_64-NEXT: # %bb.1: # %cond.load 3258; KNL_64-NEXT: vmovq %xmm0, %rcx 3259; KNL_64-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 3260; KNL_64-NEXT: .LBB42_2: # %else 3261; KNL_64-NEXT: testb $2, %al 3262; KNL_64-NEXT: je .LBB42_4 3263; KNL_64-NEXT: # %bb.3: # %cond.load1 3264; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 3265; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm1, %xmm2 3266; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 3267; KNL_64-NEXT: .LBB42_4: # %else2 3268; KNL_64-NEXT: testb $4, %al 3269; KNL_64-NEXT: vextracti128 $1, %ymm0, %xmm2 3270; KNL_64-NEXT: je .LBB42_6 3271; KNL_64-NEXT: # %bb.5: # %cond.load4 3272; KNL_64-NEXT: vmovq %xmm2, %rcx 3273; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm3 3274; KNL_64-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5],ymm1[6,7] 3275; KNL_64-NEXT: .LBB42_6: # %else5 3276; KNL_64-NEXT: testb $8, %al 3277; KNL_64-NEXT: je .LBB42_8 3278; KNL_64-NEXT: # %bb.7: # %cond.load7 3279; KNL_64-NEXT: vpextrq $1, %xmm2, %rax 3280; KNL_64-NEXT: vpbroadcastq (%rax), %ymm3 3281; KNL_64-NEXT: 
vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] 3282; KNL_64-NEXT: .LBB42_8: # %else8 3283; KNL_64-NEXT: kmovw %k0, %eax 3284; KNL_64-NEXT: testb $1, %al 3285; KNL_64-NEXT: # implicit-def: $ymm3 3286; KNL_64-NEXT: jne .LBB42_9 3287; KNL_64-NEXT: # %bb.10: # %else15 3288; KNL_64-NEXT: testb $2, %al 3289; KNL_64-NEXT: jne .LBB42_11 3290; KNL_64-NEXT: .LBB42_12: # %else21 3291; KNL_64-NEXT: testb $4, %al 3292; KNL_64-NEXT: jne .LBB42_13 3293; KNL_64-NEXT: .LBB42_14: # %else27 3294; KNL_64-NEXT: testb $8, %al 3295; KNL_64-NEXT: je .LBB42_16 3296; KNL_64-NEXT: .LBB42_15: # %cond.load29 3297; KNL_64-NEXT: vpextrq $1, %xmm2, %rax 3298; KNL_64-NEXT: vpbroadcastq (%rax), %ymm4 3299; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7] 3300; KNL_64-NEXT: .LBB42_16: # %else33 3301; KNL_64-NEXT: kmovw %k0, %eax 3302; KNL_64-NEXT: testb $1, %al 3303; KNL_64-NEXT: # implicit-def: $ymm4 3304; KNL_64-NEXT: jne .LBB42_17 3305; KNL_64-NEXT: # %bb.18: # %else40 3306; KNL_64-NEXT: testb $2, %al 3307; KNL_64-NEXT: jne .LBB42_19 3308; KNL_64-NEXT: .LBB42_20: # %else46 3309; KNL_64-NEXT: testb $4, %al 3310; KNL_64-NEXT: jne .LBB42_21 3311; KNL_64-NEXT: .LBB42_22: # %else52 3312; KNL_64-NEXT: testb $8, %al 3313; KNL_64-NEXT: je .LBB42_24 3314; KNL_64-NEXT: .LBB42_23: # %cond.load54 3315; KNL_64-NEXT: vpextrq $1, %xmm2, %rax 3316; KNL_64-NEXT: vpbroadcastq (%rax), %ymm0 3317; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm0[6,7] 3318; KNL_64-NEXT: .LBB42_24: # %else58 3319; KNL_64-NEXT: vpaddq %ymm3, %ymm1, %ymm0 3320; KNL_64-NEXT: vpaddq %ymm4, %ymm0, %ymm0 3321; KNL_64-NEXT: retq 3322; KNL_64-NEXT: .LBB42_9: # %cond.load11 3323; KNL_64-NEXT: vmovq %xmm0, %rcx 3324; KNL_64-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero 3325; KNL_64-NEXT: testb $2, %al 3326; KNL_64-NEXT: je .LBB42_12 3327; KNL_64-NEXT: .LBB42_11: # %cond.load17 3328; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 3329; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm3, %xmm4 3330; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm4[0,1,2,3],ymm3[4,5,6,7] 3331; KNL_64-NEXT: testb $4, %al 3332; KNL_64-NEXT: je .LBB42_14 3333; KNL_64-NEXT: .LBB42_13: # %cond.load23 3334; KNL_64-NEXT: vmovq %xmm2, %rcx 3335; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm4 3336; KNL_64-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5],ymm3[6,7] 3337; KNL_64-NEXT: testb $8, %al 3338; KNL_64-NEXT: jne .LBB42_15 3339; KNL_64-NEXT: jmp .LBB42_16 3340; KNL_64-NEXT: .LBB42_17: # %cond.load36 3341; KNL_64-NEXT: vmovq %xmm0, %rcx 3342; KNL_64-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero 3343; KNL_64-NEXT: testb $2, %al 3344; KNL_64-NEXT: je .LBB42_20 3345; KNL_64-NEXT: .LBB42_19: # %cond.load42 3346; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx 3347; KNL_64-NEXT: vpinsrq $1, (%rcx), %xmm4, %xmm0 3348; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3],ymm4[4,5,6,7] 3349; KNL_64-NEXT: testb $4, %al 3350; KNL_64-NEXT: je .LBB42_22 3351; KNL_64-NEXT: .LBB42_21: # %cond.load48 3352; KNL_64-NEXT: vmovq %xmm2, %rcx 3353; KNL_64-NEXT: vpbroadcastq (%rcx), %ymm0 3354; KNL_64-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5],ymm4[6,7] 3355; KNL_64-NEXT: testb $8, %al 3356; KNL_64-NEXT: jne .LBB42_23 3357; KNL_64-NEXT: jmp .LBB42_24 3358; 3359; KNL_32-LABEL: test_pr28312: 3360; KNL_32: # %bb.0: 3361; KNL_32-NEXT: pushl %ebp 3362; KNL_32-NEXT: .cfi_def_cfa_offset 8 3363; KNL_32-NEXT: .cfi_offset %ebp, -8 3364; KNL_32-NEXT: movl %esp, %ebp 3365; KNL_32-NEXT: .cfi_def_cfa_register %ebp 3366; KNL_32-NEXT: pushl %ebx 3367; KNL_32-NEXT: pushl %esi 3368; KNL_32-NEXT: andl $-32, %esp 3369; KNL_32-NEXT: subl $32, %esp 3370; KNL_32-NEXT: .cfi_offset %esi, -16 3371; KNL_32-NEXT: .cfi_offset %ebx, -12 3372; KNL_32-NEXT: vpslld $31, %xmm1, %xmm1 3373; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k0 3374; KNL_32-NEXT: kmovw %k0, %ebx 3375; KNL_32-NEXT: testb $1, %bl 3376; KNL_32-NEXT: vmovd %xmm0, %eax 3377; KNL_32-NEXT: # implicit-def: $ymm1 3378; KNL_32-NEXT: je .LBB42_2 3379; KNL_32-NEXT: # %bb.1: # %cond.load 3380; KNL_32-NEXT: vmovq {{.*#+}} xmm1 = 
mem[0],zero 3381; KNL_32-NEXT: .LBB42_2: # %else 3382; KNL_32-NEXT: testb $2, %bl 3383; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx 3384; KNL_32-NEXT: je .LBB42_4 3385; KNL_32-NEXT: # %bb.3: # %cond.load1 3386; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm1, %xmm2 3387; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 3388; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 3389; KNL_32-NEXT: .LBB42_4: # %else2 3390; KNL_32-NEXT: testb $4, %bl 3391; KNL_32-NEXT: vpextrd $2, %xmm0, %edx 3392; KNL_32-NEXT: je .LBB42_6 3393; KNL_32-NEXT: # %bb.5: # %cond.load4 3394; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 3395; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5,6,7] 3396; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 3397; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] 3398; KNL_32-NEXT: .LBB42_6: # %else5 3399; KNL_32-NEXT: testb $8, %bl 3400; KNL_32-NEXT: vpextrd $3, %xmm0, %esi 3401; KNL_32-NEXT: je .LBB42_8 3402; KNL_32-NEXT: # %bb.7: # %cond.load7 3403; KNL_32-NEXT: vpbroadcastd (%esi), %ymm0 3404; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6],ymm1[7] 3405; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm1 3406; KNL_32-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,6],ymm1[7] 3407; KNL_32-NEXT: .LBB42_8: # %else8 3408; KNL_32-NEXT: kmovw %k0, %ebx 3409; KNL_32-NEXT: testb $1, %bl 3410; KNL_32-NEXT: # implicit-def: $ymm0 3411; KNL_32-NEXT: jne .LBB42_9 3412; KNL_32-NEXT: # %bb.10: # %else15 3413; KNL_32-NEXT: testb $2, %bl 3414; KNL_32-NEXT: jne .LBB42_11 3415; KNL_32-NEXT: .LBB42_12: # %else21 3416; KNL_32-NEXT: testb $4, %bl 3417; KNL_32-NEXT: jne .LBB42_13 3418; KNL_32-NEXT: .LBB42_14: # %else27 3419; KNL_32-NEXT: testb $8, %bl 3420; KNL_32-NEXT: je .LBB42_16 3421; KNL_32-NEXT: .LBB42_15: # %cond.load29 3422; KNL_32-NEXT: vpbroadcastd (%esi), %ymm2 3423; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] 3424; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm2 3425; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3,4,5,6],ymm2[7] 3426; KNL_32-NEXT: .LBB42_16: # %else33 3427; KNL_32-NEXT: kmovw %k0, %ebx 3428; KNL_32-NEXT: testb $1, %bl 3429; KNL_32-NEXT: # implicit-def: $ymm2 3430; KNL_32-NEXT: jne .LBB42_17 3431; KNL_32-NEXT: # %bb.18: # %else40 3432; KNL_32-NEXT: testb $2, %bl 3433; KNL_32-NEXT: jne .LBB42_19 3434; KNL_32-NEXT: .LBB42_20: # %else46 3435; KNL_32-NEXT: testb $4, %bl 3436; KNL_32-NEXT: jne .LBB42_21 3437; KNL_32-NEXT: .LBB42_22: # %else52 3438; KNL_32-NEXT: testb $8, %bl 3439; KNL_32-NEXT: je .LBB42_24 3440; KNL_32-NEXT: .LBB42_23: # %cond.load54 3441; KNL_32-NEXT: vpbroadcastd (%esi), %ymm3 3442; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7] 3443; KNL_32-NEXT: vpbroadcastd 4(%esi), %ymm3 3444; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7] 3445; KNL_32-NEXT: .LBB42_24: # %else58 3446; KNL_32-NEXT: vpaddq %ymm0, %ymm1, %ymm0 3447; KNL_32-NEXT: vpaddq %ymm2, %ymm0, %ymm0 3448; KNL_32-NEXT: leal -8(%ebp), %esp 3449; KNL_32-NEXT: popl %esi 3450; KNL_32-NEXT: popl %ebx 3451; KNL_32-NEXT: popl %ebp 3452; KNL_32-NEXT: .cfi_def_cfa %esp, 4 3453; KNL_32-NEXT: retl 3454; KNL_32-NEXT: .LBB42_9: # %cond.load11 3455; KNL_32-NEXT: .cfi_def_cfa %ebp, 8 3456; KNL_32-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 3457; KNL_32-NEXT: testb $2, %bl 3458; KNL_32-NEXT: je .LBB42_12 3459; KNL_32-NEXT: .LBB42_11: # %cond.load17 3460; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm2 3461; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm2, %xmm2 3462; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 3463; KNL_32-NEXT: testb $4, %bl 3464; KNL_32-NEXT: je .LBB42_14 3465; KNL_32-NEXT: .LBB42_13: # %cond.load23 3466; KNL_32-NEXT: vpbroadcastd (%edx), %ymm2 3467; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7] 3468; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm2 3469; KNL_32-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7] 3470; KNL_32-NEXT: testb $8, %bl 3471; KNL_32-NEXT: jne .LBB42_15 3472; 
KNL_32-NEXT: jmp .LBB42_16 3473; KNL_32-NEXT: .LBB42_17: # %cond.load36 3474; KNL_32-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero 3475; KNL_32-NEXT: testb $2, %bl 3476; KNL_32-NEXT: je .LBB42_20 3477; KNL_32-NEXT: .LBB42_19: # %cond.load42 3478; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm2, %xmm3 3479; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm3, %xmm3 3480; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 3481; KNL_32-NEXT: testb $4, %bl 3482; KNL_32-NEXT: je .LBB42_22 3483; KNL_32-NEXT: .LBB42_21: # %cond.load48 3484; KNL_32-NEXT: vpbroadcastd (%edx), %ymm3 3485; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7] 3486; KNL_32-NEXT: vpbroadcastd 4(%edx), %ymm3 3487; KNL_32-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 3488; KNL_32-NEXT: testb $8, %bl 3489; KNL_32-NEXT: jne .LBB42_23 3490; KNL_32-NEXT: jmp .LBB42_24 3491; 3492; SKX-LABEL: test_pr28312: 3493; SKX: # %bb.0: 3494; SKX-NEXT: vpslld $31, %xmm1, %xmm1 3495; SKX-NEXT: vpmovd2m %xmm1, %k1 3496; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1 3497; SKX-NEXT: vpgatherqq (,%ymm0), %ymm1 {%k1} 3498; SKX-NEXT: vpaddq %ymm1, %ymm1, %ymm0 3499; SKX-NEXT: vpaddq %ymm1, %ymm0, %ymm0 3500; SKX-NEXT: retq 3501; 3502; SKX_32-LABEL: test_pr28312: 3503; SKX_32: # %bb.0: 3504; SKX_32-NEXT: pushl %ebp 3505; SKX_32-NEXT: .cfi_def_cfa_offset 8 3506; SKX_32-NEXT: .cfi_offset %ebp, -8 3507; SKX_32-NEXT: movl %esp, %ebp 3508; SKX_32-NEXT: .cfi_def_cfa_register %ebp 3509; SKX_32-NEXT: andl $-32, %esp 3510; SKX_32-NEXT: subl $32, %esp 3511; SKX_32-NEXT: vpslld $31, %xmm1, %xmm1 3512; SKX_32-NEXT: vpmovd2m %xmm1, %k1 3513; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 3514; SKX_32-NEXT: vpgatherdq (,%xmm0), %ymm1 {%k1} 3515; SKX_32-NEXT: vpaddq %ymm1, %ymm1, %ymm0 3516; SKX_32-NEXT: vpaddq %ymm1, %ymm0, %ymm0 3517; SKX_32-NEXT: movl %ebp, %esp 3518; SKX_32-NEXT: popl %ebp 3519; SKX_32-NEXT: .cfi_def_cfa %esp, 4 3520; SKX_32-NEXT: retl 3521 %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, 
i32 8, <4 x i1> %k, <4 x i64> undef) 3522 %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) 3523 %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr> %p1, i32 8, <4 x i1> %k, <4 x i64> undef) 3524 %a = add <4 x i64> %g1, %g2 3525 %b = add <4 x i64> %a, %g3 3526 ret <4 x i64> %b 3527} 3528declare <4 x i64> @llvm.masked.gather.v4i64.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i64>) 3529 3530define <8 x i32> @test_global_array(<8 x i64> %indxs) { 3531; KNL_64-LABEL: test_global_array: 3532; KNL_64: # %bb.0: 3533; KNL_64-NEXT: kxnorw %k0, %k0, %k1 3534; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 3535; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3536; KNL_64-NEXT: vmovdqa %ymm1, %ymm0 3537; KNL_64-NEXT: retq 3538; 3539; KNL_32-LABEL: test_global_array: 3540; KNL_32: # %bb.0: 3541; KNL_32-NEXT: kxnorw %k0, %k0, %k1 3542; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 3543; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3544; KNL_32-NEXT: vmovdqa %ymm1, %ymm0 3545; KNL_32-NEXT: retl 3546; 3547; SKX_SMALL-LABEL: test_global_array: 3548; SKX_SMALL: # %bb.0: 3549; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 3550; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 3551; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3552; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0 3553; SKX_SMALL-NEXT: retq 3554; 3555; SKX_LARGE-LABEL: test_global_array: 3556; SKX_LARGE: # %bb.0: 3557; SKX_LARGE-NEXT: movabsq $glob_array, %rax 3558; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 3559; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 3560; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} 3561; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0 3562; SKX_LARGE-NEXT: retq 3563; 3564; SKX_32-LABEL: test_global_array: 3565; SKX_32: # %bb.0: 3566; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3567; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 3568; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3569; SKX_32-NEXT: vmovdqa %ymm1, %ymm0 3570; SKX_32-NEXT: retl 3571 %p = 
getelementptr inbounds [16 x i32], ptr @glob_array, i64 0, <8 x i64> %indxs 3572 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 3573 ret <8 x i32> %g 3574} 3575 3576define <8 x i32> @test_global_array_zeroinitializer_index(<8 x i64> %indxs) { 3577; KNL_64-LABEL: test_global_array_zeroinitializer_index: 3578; KNL_64: # %bb.0: 3579; KNL_64-NEXT: kxnorw %k0, %k0, %k1 3580; KNL_64-NEXT: vpxor %xmm1, %xmm1, %xmm1 3581; KNL_64-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3582; KNL_64-NEXT: vmovdqa %ymm1, %ymm0 3583; KNL_64-NEXT: retq 3584; 3585; KNL_32-LABEL: test_global_array_zeroinitializer_index: 3586; KNL_32: # %bb.0: 3587; KNL_32-NEXT: kxnorw %k0, %k0, %k1 3588; KNL_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 3589; KNL_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3590; KNL_32-NEXT: vmovdqa %ymm1, %ymm0 3591; KNL_32-NEXT: retl 3592; 3593; SKX_SMALL-LABEL: test_global_array_zeroinitializer_index: 3594; SKX_SMALL: # %bb.0: 3595; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 3596; SKX_SMALL-NEXT: vpxor %xmm1, %xmm1, %xmm1 3597; SKX_SMALL-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3598; SKX_SMALL-NEXT: vmovdqa %ymm1, %ymm0 3599; SKX_SMALL-NEXT: retq 3600; 3601; SKX_LARGE-LABEL: test_global_array_zeroinitializer_index: 3602; SKX_LARGE: # %bb.0: 3603; SKX_LARGE-NEXT: movabsq $glob_array, %rax 3604; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 3605; SKX_LARGE-NEXT: vpxor %xmm1, %xmm1, %xmm1 3606; SKX_LARGE-NEXT: vpgatherqd (%rax,%zmm0,4), %ymm1 {%k1} 3607; SKX_LARGE-NEXT: vmovdqa %ymm1, %ymm0 3608; SKX_LARGE-NEXT: retq 3609; 3610; SKX_32-LABEL: test_global_array_zeroinitializer_index: 3611; SKX_32: # %bb.0: 3612; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3613; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1 3614; SKX_32-NEXT: vpgatherqd glob_array(,%zmm0,4), %ymm1 {%k1} 3615; SKX_32-NEXT: vmovdqa %ymm1, %ymm0 3616; SKX_32-NEXT: retl 3617 %p = getelementptr inbounds [16 x 
i32], ptr @glob_array, <8 x i64> zeroinitializer, <8 x i64> %indxs 3618 %g = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> %p, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) 3619 ret <8 x i32> %g 3620} 3621 3622define void @v1_scatter(<1 x i32>%a1, <1 x ptr> %ptr, <1 x i1> %mask) { 3623; KNL_64-LABEL: v1_scatter: 3624; KNL_64: # %bb.0: 3625; KNL_64-NEXT: testb $1, %dl 3626; KNL_64-NEXT: je .LBB45_2 3627; KNL_64-NEXT: # %bb.1: # %cond.store 3628; KNL_64-NEXT: movl %edi, (%rsi) 3629; KNL_64-NEXT: .LBB45_2: # %else 3630; KNL_64-NEXT: retq 3631; 3632; KNL_32-LABEL: v1_scatter: 3633; KNL_32: # %bb.0: 3634; KNL_32-NEXT: testb $1, {{[0-9]+}}(%esp) 3635; KNL_32-NEXT: je .LBB45_2 3636; KNL_32-NEXT: # %bb.1: # %cond.store 3637; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3638; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %ecx 3639; KNL_32-NEXT: movl %ecx, (%eax) 3640; KNL_32-NEXT: .LBB45_2: # %else 3641; KNL_32-NEXT: retl 3642; 3643; SKX-LABEL: v1_scatter: 3644; SKX: # %bb.0: 3645; SKX-NEXT: testb $1, %dl 3646; SKX-NEXT: je .LBB45_2 3647; SKX-NEXT: # %bb.1: # %cond.store 3648; SKX-NEXT: movl %edi, (%rsi) 3649; SKX-NEXT: .LBB45_2: # %else 3650; SKX-NEXT: retq 3651; 3652; SKX_32-LABEL: v1_scatter: 3653; SKX_32: # %bb.0: 3654; SKX_32-NEXT: testb $1, {{[0-9]+}}(%esp) 3655; SKX_32-NEXT: je .LBB45_2 3656; SKX_32-NEXT: # %bb.1: # %cond.store 3657; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3658; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %ecx 3659; SKX_32-NEXT: movl %ecx, (%eax) 3660; SKX_32-NEXT: .LBB45_2: # %else 3661; SKX_32-NEXT: retl 3662 call void @llvm.masked.scatter.v1i32.v1p0(<1 x i32> %a1, <1 x ptr> %ptr, i32 4, <1 x i1> %mask) 3663 ret void 3664} 3665declare void @llvm.masked.scatter.v1i32.v1p0(<1 x i32>, <1 x ptr>, i32, <1 x i1>) 3666 3667define <1 x i32> @v1_gather(<1 x ptr> %ptr, <1 x i1> %mask, <1 x i32> %src0) { 3668; KNL_64-LABEL: v1_gather: 3669; KNL_64: # %bb.0: 3670; KNL_64-NEXT: movl (%rdi), %eax 3671; 
KNL_64-NEXT: retq 3672; 3673; KNL_32-LABEL: v1_gather: 3674; KNL_32: # %bb.0: 3675; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3676; KNL_32-NEXT: movl (%eax), %eax 3677; KNL_32-NEXT: retl 3678; 3679; SKX-LABEL: v1_gather: 3680; SKX: # %bb.0: 3681; SKX-NEXT: movl (%rdi), %eax 3682; SKX-NEXT: retq 3683; 3684; SKX_32-LABEL: v1_gather: 3685; SKX_32: # %bb.0: 3686; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3687; SKX_32-NEXT: movl (%eax), %eax 3688; SKX_32-NEXT: retl 3689 %res = call <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr> %ptr, i32 4, <1 x i1> <i1 true>, <1 x i32> %src0) 3690 ret <1 x i32>%res 3691} 3692declare <1 x i32> @llvm.masked.gather.v1i32.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i32>) 3693 3694; Make sure we don't crash when the index element type is larger than i64 and we need to widen the result 3695; This experienced a bad interaction when we widened and then tried to split. 3696define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x float> %src0) { 3697; KNL_64-LABEL: large_index: 3698; KNL_64: # %bb.0: 3699; KNL_64-NEXT: vpsllq $63, %xmm0, %xmm0 3700; KNL_64-NEXT: vptestmq %zmm0, %zmm0, %k0 3701; KNL_64-NEXT: vmovq %rcx, %xmm0 3702; KNL_64-NEXT: vmovq %rsi, %xmm2 3703; KNL_64-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 3704; KNL_64-NEXT: vpsllq $2, %xmm0, %xmm0 3705; KNL_64-NEXT: vmovq %rdi, %xmm2 3706; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 3707; KNL_64-NEXT: vpaddq %xmm0, %xmm2, %xmm0 3708; KNL_64-NEXT: kmovw %k0, %eax 3709; KNL_64-NEXT: testb $1, %al 3710; KNL_64-NEXT: jne .LBB47_1 3711; KNL_64-NEXT: # %bb.2: # %else 3712; KNL_64-NEXT: testb $2, %al 3713; KNL_64-NEXT: jne .LBB47_3 3714; KNL_64-NEXT: .LBB47_4: # %else2 3715; KNL_64-NEXT: vmovdqa %xmm1, %xmm0 3716; KNL_64-NEXT: vzeroupper 3717; KNL_64-NEXT: retq 3718; KNL_64-NEXT: .LBB47_1: # %cond.load 3719; KNL_64-NEXT: vmovq %xmm0, %rcx 3720; KNL_64-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 3721; KNL_64-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 3722; 
KNL_64-NEXT: testb $2, %al 3723; KNL_64-NEXT: je .LBB47_4 3724; KNL_64-NEXT: .LBB47_3: # %cond.load1 3725; KNL_64-NEXT: vpextrq $1, %xmm0, %rax 3726; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 3727; KNL_64-NEXT: vmovaps %xmm1, %xmm0 3728; KNL_64-NEXT: vzeroupper 3729; KNL_64-NEXT: retq 3730; 3731; KNL_32-LABEL: large_index: 3732; KNL_32: # %bb.0: 3733; KNL_32-NEXT: vpsllq $63, %xmm0, %xmm0 3734; KNL_32-NEXT: vptestmq %zmm0, %zmm0, %k0 3735; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 3736; KNL_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 3737; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0 3738; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 3739; KNL_32-NEXT: vpaddd %xmm0, %xmm2, %xmm0 3740; KNL_32-NEXT: kmovw %k0, %eax 3741; KNL_32-NEXT: testb $1, %al 3742; KNL_32-NEXT: jne .LBB47_1 3743; KNL_32-NEXT: # %bb.2: # %else 3744; KNL_32-NEXT: testb $2, %al 3745; KNL_32-NEXT: jne .LBB47_3 3746; KNL_32-NEXT: .LBB47_4: # %else2 3747; KNL_32-NEXT: vmovdqa %xmm1, %xmm0 3748; KNL_32-NEXT: vzeroupper 3749; KNL_32-NEXT: retl 3750; KNL_32-NEXT: .LBB47_1: # %cond.load 3751; KNL_32-NEXT: vmovd %xmm0, %ecx 3752; KNL_32-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 3753; KNL_32-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 3754; KNL_32-NEXT: testb $2, %al 3755; KNL_32-NEXT: je .LBB47_4 3756; KNL_32-NEXT: .LBB47_3: # %cond.load1 3757; KNL_32-NEXT: vpextrd $1, %xmm0, %eax 3758; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 3759; KNL_32-NEXT: vmovaps %xmm1, %xmm0 3760; KNL_32-NEXT: vzeroupper 3761; KNL_32-NEXT: retl 3762; 3763; SKX-LABEL: large_index: 3764; SKX: # %bb.0: 3765; SKX-NEXT: vpsllq $63, %xmm0, %xmm0 3766; SKX-NEXT: vpmovq2m %xmm0, %k0 3767; SKX-NEXT: vmovq %rcx, %xmm0 3768; SKX-NEXT: vmovq %rsi, %xmm2 3769; SKX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0] 3770; SKX-NEXT: vpsllq $2, %xmm0, %xmm0 3771; SKX-NEXT: vpbroadcastq %rdi, %xmm2 3772; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0 3773; SKX-NEXT: kmovw %k0, %eax 
3774; SKX-NEXT: testb $1, %al 3775; SKX-NEXT: jne .LBB47_1 3776; SKX-NEXT: # %bb.2: # %else 3777; SKX-NEXT: testb $2, %al 3778; SKX-NEXT: jne .LBB47_3 3779; SKX-NEXT: .LBB47_4: # %else2 3780; SKX-NEXT: vmovdqa %xmm1, %xmm0 3781; SKX-NEXT: retq 3782; SKX-NEXT: .LBB47_1: # %cond.load 3783; SKX-NEXT: vmovq %xmm0, %rcx 3784; SKX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 3785; SKX-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 3786; SKX-NEXT: testb $2, %al 3787; SKX-NEXT: je .LBB47_4 3788; SKX-NEXT: .LBB47_3: # %cond.load1 3789; SKX-NEXT: vpextrq $1, %xmm0, %rax 3790; SKX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 3791; SKX-NEXT: vmovaps %xmm1, %xmm0 3792; SKX-NEXT: retq 3793; 3794; SKX_32-LABEL: large_index: 3795; SKX_32: # %bb.0: 3796; SKX_32-NEXT: vpsllq $63, %xmm0, %xmm0 3797; SKX_32-NEXT: vpmovq2m %xmm0, %k0 3798; SKX_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 3799; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 3800; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0 3801; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0 3802; SKX_32-NEXT: kmovw %k0, %eax 3803; SKX_32-NEXT: testb $1, %al 3804; SKX_32-NEXT: jne .LBB47_1 3805; SKX_32-NEXT: # %bb.2: # %else 3806; SKX_32-NEXT: testb $2, %al 3807; SKX_32-NEXT: jne .LBB47_3 3808; SKX_32-NEXT: .LBB47_4: # %else2 3809; SKX_32-NEXT: vmovaps %xmm1, %xmm0 3810; SKX_32-NEXT: retl 3811; SKX_32-NEXT: .LBB47_1: # %cond.load 3812; SKX_32-NEXT: vmovd %xmm0, %ecx 3813; SKX_32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 3814; SKX_32-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3] 3815; SKX_32-NEXT: testb $2, %al 3816; SKX_32-NEXT: je .LBB47_4 3817; SKX_32-NEXT: .LBB47_3: # %cond.load1 3818; SKX_32-NEXT: vpextrd $1, %xmm0, %eax 3819; SKX_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3] 3820; SKX_32-NEXT: vmovaps %xmm1, %xmm0 3821; SKX_32-NEXT: retl 3822 %gep.random = getelementptr float, ptr %base, <2 x i128> %ind 3823 %res = call <2 x float> 
@llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0) 3824 ret <2 x float>%res 3825} 3826 3827; Make sure we allow index to be sign extended from a smaller than i32 element size. 3828define <16 x float> @sext_i8_index(ptr %base, <16 x i8> %ind) { 3829; KNL_64-LABEL: sext_i8_index: 3830; KNL_64: # %bb.0: 3831; KNL_64-NEXT: vpmovsxbd %xmm0, %zmm1 3832; KNL_64-NEXT: kxnorw %k0, %k0, %k1 3833; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 3834; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 3835; KNL_64-NEXT: retq 3836; 3837; KNL_32-LABEL: sext_i8_index: 3838; KNL_32: # %bb.0: 3839; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3840; KNL_32-NEXT: vpmovsxbd %xmm0, %zmm1 3841; KNL_32-NEXT: kxnorw %k0, %k0, %k1 3842; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3843; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3844; KNL_32-NEXT: retl 3845; 3846; SKX-LABEL: sext_i8_index: 3847; SKX: # %bb.0: 3848; SKX-NEXT: vpmovsxbd %xmm0, %zmm1 3849; SKX-NEXT: kxnorw %k0, %k0, %k1 3850; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3851; SKX-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 3852; SKX-NEXT: retq 3853; 3854; SKX_32-LABEL: sext_i8_index: 3855; SKX_32: # %bb.0: 3856; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3857; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm1 3858; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3859; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3860; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3861; SKX_32-NEXT: retl 3862 3863 %sext_ind = sext <16 x i8> %ind to <16 x i64> 3864 %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind 3865 3866 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 3867 ret <16 x float>%res 3868} 3869 3870; Make sure we allow index to be sign extended from a smaller than i32 element size. 
3871define <8 x float> @sext_v8i8_index(ptr %base, <8 x i8> %ind) { 3872; KNL_64-LABEL: sext_v8i8_index: 3873; KNL_64: # %bb.0: 3874; KNL_64-NEXT: vpmovsxbd %xmm0, %ymm1 3875; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 3876; KNL_64-NEXT: movw $255, %ax 3877; KNL_64-NEXT: kmovw %eax, %k1 3878; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 3879; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3880; KNL_64-NEXT: retq 3881; 3882; KNL_32-LABEL: sext_v8i8_index: 3883; KNL_32: # %bb.0: 3884; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3885; KNL_32-NEXT: vpmovsxbd %xmm0, %ymm1 3886; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3887; KNL_32-NEXT: movw $255, %cx 3888; KNL_32-NEXT: kmovw %ecx, %k1 3889; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3890; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3891; KNL_32-NEXT: retl 3892; 3893; SKX-LABEL: sext_v8i8_index: 3894; SKX: # %bb.0: 3895; SKX-NEXT: vpmovsxbd %xmm0, %ymm1 3896; SKX-NEXT: kxnorw %k0, %k0, %k1 3897; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3898; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} 3899; SKX-NEXT: retq 3900; 3901; SKX_32-LABEL: sext_v8i8_index: 3902; SKX_32: # %bb.0: 3903; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3904; SKX_32-NEXT: vpmovsxbd %xmm0, %ymm1 3905; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3906; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3907; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} 3908; SKX_32-NEXT: retl 3909 3910 %sext_ind = sext <8 x i8> %ind to <8 x i64> 3911 %gep.random = getelementptr float, ptr%base, <8 x i64> %sext_ind 3912 3913 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef) 3914 ret <8 x float>%res 3915} 3916declare <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x float>) 3917 3918; Make sure we also allow index to be zero extended from a smaller than i32 element size. 
3919define <16 x float> @zext_i8_index(ptr %base, <16 x i8> %ind) { 3920; KNL_64-LABEL: zext_i8_index: 3921; KNL_64: # %bb.0: 3922; KNL_64-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 3923; KNL_64-NEXT: kxnorw %k0, %k0, %k1 3924; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 3925; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 3926; KNL_64-NEXT: retq 3927; 3928; KNL_32-LABEL: zext_i8_index: 3929; KNL_32: # %bb.0: 3930; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3931; KNL_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 3932; KNL_32-NEXT: kxnorw %k0, %k0, %k1 3933; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3934; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3935; KNL_32-NEXT: retl 3936; 3937; SKX-LABEL: zext_i8_index: 3938; SKX: # %bb.0: 3939; SKX-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 3940; SKX-NEXT: kxnorw %k0, %k0, %k1 3941; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3942; SKX-NEXT: vgatherdps 
(%rdi,%zmm1,4), %zmm0 {%k1} 3943; SKX-NEXT: retq 3944; 3945; SKX_32-LABEL: zext_i8_index: 3946; SKX_32: # %bb.0: 3947; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3948; SKX_32-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero 3949; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3950; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3951; SKX_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3952; SKX_32-NEXT: retl 3953 3954 %zext_ind = zext <16 x i8> %ind to <16 x i64> 3955 %gep.random = getelementptr float, ptr%base, <16 x i64> %zext_ind 3956 3957 %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef) 3958 ret <16 x float>%res 3959} 3960 3961; Make sure we also allow index to be zero extended from a smaller than i32 element size. 
3962define <8 x float> @zext_v8i8_index(ptr %base, <8 x i8> %ind) { 3963; KNL_64-LABEL: zext_v8i8_index: 3964; KNL_64: # %bb.0: 3965; KNL_64-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 3966; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 3967; KNL_64-NEXT: movw $255, %ax 3968; KNL_64-NEXT: kmovw %eax, %k1 3969; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 3970; KNL_64-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3971; KNL_64-NEXT: retq 3972; 3973; KNL_32-LABEL: zext_v8i8_index: 3974; KNL_32: # %bb.0: 3975; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3976; KNL_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 3977; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3978; KNL_32-NEXT: movw $255, %cx 3979; KNL_32-NEXT: kmovw %ecx, %k1 3980; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 3981; KNL_32-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 3982; KNL_32-NEXT: retl 3983; 3984; SKX-LABEL: zext_v8i8_index: 3985; SKX: # %bb.0: 3986; SKX-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 3987; SKX-NEXT: kxnorw %k0, %k0, %k1 3988; SKX-NEXT: vpxor %xmm0, %xmm0, %xmm0 3989; SKX-NEXT: vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1} 3990; SKX-NEXT: retq 3991; 3992; SKX_32-LABEL: zext_v8i8_index: 3993; SKX_32: # %bb.0: 3994; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax 3995; SKX_32-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero 
3996; SKX_32-NEXT: kxnorw %k0, %k0, %k1 3997; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 3998; SKX_32-NEXT: vgatherdps (%eax,%ymm1,4), %ymm0 {%k1} 3999; SKX_32-NEXT: retl 4000 4001 %zext_ind = zext <8 x i8> %ind to <8 x i64> 4002 %gep.random = getelementptr float, ptr%base, <8 x i64> %zext_ind 4003 4004 %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0(<8 x ptr> %gep.random, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> undef) 4005 ret <8 x float>%res 4006} 4007 4008; Index requires promotion 4009define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind, <2 x i1> %mask) { 4010; KNL_64-LABEL: test_scatter_2i32_index: 4011; KNL_64: # %bb.0: 4012; KNL_64-NEXT: vpsllq $63, %xmm2, %xmm2 4013; KNL_64-NEXT: vptestmq %zmm2, %zmm2, %k0 4014; KNL_64-NEXT: vpmovsxdq %xmm1, %xmm1 4015; KNL_64-NEXT: vpsllq $3, %xmm1, %xmm1 4016; KNL_64-NEXT: vmovq %rdi, %xmm2 4017; KNL_64-NEXT: vpbroadcastq %xmm2, %xmm2 4018; KNL_64-NEXT: vpaddq %xmm1, %xmm2, %xmm1 4019; KNL_64-NEXT: kmovw %k0, %eax 4020; KNL_64-NEXT: testb $1, %al 4021; KNL_64-NEXT: jne .LBB52_1 4022; KNL_64-NEXT: # %bb.2: # %else 4023; KNL_64-NEXT: testb $2, %al 4024; KNL_64-NEXT: jne .LBB52_3 4025; KNL_64-NEXT: .LBB52_4: # %else2 4026; KNL_64-NEXT: vzeroupper 4027; KNL_64-NEXT: retq 4028; KNL_64-NEXT: .LBB52_1: # %cond.store 4029; KNL_64-NEXT: vmovq %xmm1, %rcx 4030; KNL_64-NEXT: vmovlps %xmm0, (%rcx) 4031; KNL_64-NEXT: testb $2, %al 4032; KNL_64-NEXT: je .LBB52_4 4033; KNL_64-NEXT: .LBB52_3: # %cond.store1 4034; KNL_64-NEXT: vpextrq $1, %xmm1, %rax 4035; KNL_64-NEXT: vmovhps %xmm0, (%rax) 4036; KNL_64-NEXT: vzeroupper 4037; KNL_64-NEXT: retq 4038; 4039; KNL_32-LABEL: test_scatter_2i32_index: 4040; KNL_32: # %bb.0: 4041; KNL_32-NEXT: vpsllq $63, %xmm2, %xmm2 4042; KNL_32-NEXT: vptestmq %zmm2, %zmm2, %k0 4043; KNL_32-NEXT: vpslld $3, %xmm1, %xmm1 4044; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm2 4045; KNL_32-NEXT: vpaddd %xmm1, %xmm2, %xmm1 
4046; KNL_32-NEXT: kmovw %k0, %eax 4047; KNL_32-NEXT: testb $1, %al 4048; KNL_32-NEXT: jne .LBB52_1 4049; KNL_32-NEXT: # %bb.2: # %else 4050; KNL_32-NEXT: testb $2, %al 4051; KNL_32-NEXT: jne .LBB52_3 4052; KNL_32-NEXT: .LBB52_4: # %else2 4053; KNL_32-NEXT: vzeroupper 4054; KNL_32-NEXT: retl 4055; KNL_32-NEXT: .LBB52_1: # %cond.store 4056; KNL_32-NEXT: vmovd %xmm1, %ecx 4057; KNL_32-NEXT: vmovlps %xmm0, (%ecx) 4058; KNL_32-NEXT: testb $2, %al 4059; KNL_32-NEXT: je .LBB52_4 4060; KNL_32-NEXT: .LBB52_3: # %cond.store1 4061; KNL_32-NEXT: vpextrd $1, %xmm1, %eax 4062; KNL_32-NEXT: vmovhps %xmm0, (%eax) 4063; KNL_32-NEXT: vzeroupper 4064; KNL_32-NEXT: retl 4065; 4066; SKX-LABEL: test_scatter_2i32_index: 4067; SKX: # %bb.0: 4068; SKX-NEXT: vpsllq $63, %xmm2, %xmm2 4069; SKX-NEXT: vpmovq2m %xmm2, %k0 4070; SKX-NEXT: vpbroadcastq %rdi, %xmm2 4071; SKX-NEXT: vpmovsxdq %xmm1, %xmm1 4072; SKX-NEXT: vpsllq $3, %xmm1, %xmm1 4073; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1 4074; SKX-NEXT: kmovw %k0, %eax 4075; SKX-NEXT: testb $1, %al 4076; SKX-NEXT: jne .LBB52_1 4077; SKX-NEXT: # %bb.2: # %else 4078; SKX-NEXT: testb $2, %al 4079; SKX-NEXT: jne .LBB52_3 4080; SKX-NEXT: .LBB52_4: # %else2 4081; SKX-NEXT: retq 4082; SKX-NEXT: .LBB52_1: # %cond.store 4083; SKX-NEXT: vmovq %xmm1, %rcx 4084; SKX-NEXT: vmovlps %xmm0, (%rcx) 4085; SKX-NEXT: testb $2, %al 4086; SKX-NEXT: je .LBB52_4 4087; SKX-NEXT: .LBB52_3: # %cond.store1 4088; SKX-NEXT: vpextrq $1, %xmm1, %rax 4089; SKX-NEXT: vmovhps %xmm0, (%rax) 4090; SKX-NEXT: retq 4091; 4092; SKX_32-LABEL: test_scatter_2i32_index: 4093; SKX_32: # %bb.0: 4094; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2 4095; SKX_32-NEXT: vpmovq2m %xmm2, %k0 4096; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1 4097; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1 4098; SKX_32-NEXT: kmovw %k0, %eax 4099; SKX_32-NEXT: testb $1, %al 4100; SKX_32-NEXT: jne .LBB52_1 4101; SKX_32-NEXT: # %bb.2: # %else 4102; SKX_32-NEXT: testb $2, %al 4103; SKX_32-NEXT: jne .LBB52_3 4104; SKX_32-NEXT: 
.LBB52_4: # %else2 4105; SKX_32-NEXT: retl 4106; SKX_32-NEXT: .LBB52_1: # %cond.store 4107; SKX_32-NEXT: vmovd %xmm1, %ecx 4108; SKX_32-NEXT: vmovlps %xmm0, (%ecx) 4109; SKX_32-NEXT: testb $2, %al 4110; SKX_32-NEXT: je .LBB52_4 4111; SKX_32-NEXT: .LBB52_3: # %cond.store1 4112; SKX_32-NEXT: vpextrd $1, %xmm1, %eax 4113; SKX_32-NEXT: vmovhps %xmm0, (%eax) 4114; SKX_32-NEXT: retl 4115 %gep = getelementptr double, ptr%base, <2 x i32> %ind 4116 call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %gep, i32 4, <2 x i1> %mask) 4117 ret void 4118} 4119declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32, <2 x i1>) 4120 4121define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) { 4122; KNL_64-LABEL: zext_index: 4123; KNL_64: # %bb.0: 4124; KNL_64-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 4125; KNL_64-NEXT: kxnorw %k0, %k0, %k1 4126; KNL_64-NEXT: vpxor %xmm0, %xmm0, %xmm0 4127; KNL_64-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 4128; KNL_64-NEXT: retq 4129; 4130; KNL_32-LABEL: zext_index: 4131; KNL_32: # %bb.0: 4132; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax 4133; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1 4134; KNL_32-NEXT: kxnorw %k0, %k0, %k1 4135; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0 4136; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1} 4137; KNL_32-NEXT: retl 4138; 4139; SKX_SMALL-LABEL: zext_index: 4140; SKX_SMALL: # %bb.0: 4141; SKX_SMALL-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm1 4142; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1 4143; SKX_SMALL-NEXT: vxorps %xmm0, %xmm0, %xmm0 4144; SKX_SMALL-NEXT: vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1} 4145; SKX_SMALL-NEXT: retq 4146; 4147; SKX_LARGE-LABEL: zext_index: 4148; SKX_LARGE: # %bb.0: 4149; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax 4150; SKX_LARGE-NEXT: vandps (%rax){1to16}, %zmm0, %zmm1 4151; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1 4152; SKX_LARGE-NEXT: vxorps %xmm0, %xmm0, %xmm0 4153; SKX_LARGE-NEXT: 
vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: zext_index:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT:    retl
  %ind_masked = and <16 x i32> %ind, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
  %sext_ind = zext <16 x i32> %ind_masked to <16 x i64>
  %gep.random = getelementptr float, ptr%base, <16 x i64> %sext_ind

  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
}

; A <16 x double> gather is wider than one zmm result, so the type legalizer
; must split it into two 8-element vgatherdpd operations; the mask derived
; from the icmp is split with vextracti64x4 accordingly.
define <16 x double> @test_gather_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %passthru) {
; KNL_64-LABEL: test_gather_setcc_split:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; KNL_64-NEXT:    vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; KNL_64-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; KNL_64-NEXT:    vmovapd %zmm2, %zmm0
; KNL_64-NEXT:    vmovapd %zmm3, %zmm1
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_gather_setcc_split:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm3
; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm1
; KNL_32-NEXT:    movl 8(%ebp), %eax
; KNL_32-NEXT:    vextracti64x4 $1, %zmm3, %ymm4
; KNL_32-NEXT:    vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT:    vptestnmd %zmm3, %zmm3, %k2
; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm1 {%k1}
; KNL_32-NEXT:    vmovapd %zmm2, %zmm0
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_gather_setcc_split:
; SKX:       # %bb.0:
; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm2 {%k2}
; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT:    vgatherdpd (%rdi,%ymm0,8), %zmm3 {%k1}
; SKX-NEXT:    vmovapd %zmm2, %zmm0
; SKX-NEXT:    vmovapd %zmm3, %zmm1
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_gather_setcc_split:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT:    movl 8(%ebp), %eax
; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; SKX_32-NEXT:    vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm2 {%k2}
; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vgatherdpd (%eax,%ymm0,8), %zmm3 {%k1}
; SKX_32-NEXT:    vmovapd %zmm2, %zmm0
; SKX_32-NEXT:    vmovapd %zmm3, %zmm1
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
; SKX_32-NEXT:    retl
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr double, ptr%base, <16 x i64> %sext_ind

  %mask = icmp eq <16 x i32> %cmp, zeroinitializer
  %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> %mask, <16 x double> %passthru)
  ret <16 x double>%res
}

; Scatter counterpart of test_gather_setcc_split: a <16 x double> scatter is
; split into two 8-element vscatterdpd operations with the icmp mask split
; between %k1 and %k2.
define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cmp, <16 x double> %src0) {
; KNL_64-LABEL: test_scatter_setcc_split:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; KNL_64-NEXT:    vptestnmd %zmm4, %zmm4, %k1
; KNL_64-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; KNL_64-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
; KNL_64-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; KNL_64-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_scatter_setcc_split:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    pushl %ebp
; KNL_32-NEXT:    .cfi_def_cfa_offset 8
; KNL_32-NEXT:    .cfi_offset %ebp, -8
; KNL_32-NEXT:    movl %esp, %ebp
; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
; KNL_32-NEXT:    andl $-64, %esp
; KNL_32-NEXT:    subl $64, %esp
; KNL_32-NEXT:    vmovapd 72(%ebp), %zmm3
; KNL_32-NEXT:    movl 8(%ebp), %eax
; KNL_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; KNL_32-NEXT:    vptestnmd %zmm4, %zmm4, %k1
; KNL_32-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; KNL_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
; KNL_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; KNL_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; KNL_32-NEXT:    movl %ebp, %esp
; KNL_32-NEXT:    popl %ebp
; KNL_32-NEXT:    .cfi_def_cfa %esp, 4
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_scatter_setcc_split:
; SKX:       # %bb.0:
; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; SKX-NEXT:    vptestnmd %ymm4, %ymm4, %k1
; SKX-NEXT:    vptestnmd %ymm1, %ymm1, %k2
; SKX-NEXT:    vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT:    vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_scatter_setcc_split:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    pushl %ebp
; SKX_32-NEXT:    .cfi_def_cfa_offset 8
; SKX_32-NEXT:    .cfi_offset %ebp, -8
; SKX_32-NEXT:    movl %esp, %ebp
; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
; SKX_32-NEXT:    andl $-64, %esp
; SKX_32-NEXT:    subl $64, %esp
; SKX_32-NEXT:    vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT:    movl 8(%ebp), %eax
; SKX_32-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
; SKX_32-NEXT:    vptestnmd %ymm4, %ymm4, %k1
; SKX_32-NEXT:    vptestnmd %ymm1, %ymm1, %k2
; SKX_32-NEXT:    vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
; SKX_32-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT:    vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
; SKX_32-NEXT:    movl %ebp, %esp
; SKX_32-NEXT:    popl %ebp
; SKX_32-NEXT:    .cfi_def_cfa %esp, 4
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr double, ptr%base, <16 x i64> %sext_ind

  %mask = icmp eq <16 x i32> %cmp, zeroinitializer
  call void @llvm.masked.scatter.v16f64.v16p0(<16 x double> %src0, <16 x ptr> %gep.random, i32 4, <16 x i1> %mask)
  ret void
}

; This test case previously triggered an infinite loop when the two gathers became identical after DAG combine removed the sign extend.
define <16 x float> @test_sext_cse(ptr %base, <16 x i32> %ind, ptr %foo) {
; KNL_64-LABEL: test_sext_cse:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vmovaps %zmm0, (%rsi)
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL_64-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; KNL_64-NEXT:    vaddps %zmm1, %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: test_sext_cse:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT:    vmovaps %zmm0, (%ecx)
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; KNL_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; KNL_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: test_sext_cse:
; SKX:       # %bb.0:
; SKX-NEXT:    vmovaps %zmm0, (%rsi)
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT:    vaddps %zmm1, %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: test_sext_cse:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; SKX_32-NEXT:    vmovaps %zmm0, (%ecx)
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT:    vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT:    vaddps %zmm1, %zmm1, %zmm0
; SKX_32-NEXT:    retl
  %broadcast.splatinsert = insertelement <16 x ptr> undef, ptr %base, i32 0
  %broadcast.splat = shufflevector <16 x ptr> %broadcast.splatinsert, <16 x ptr> undef, <16 x i32> zeroinitializer

  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i64> %sext_ind

  store <16 x i32> %ind, ptr %foo
  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  %gep.random2 = getelementptr float, <16 x ptr> %broadcast.splat, <16 x i32> %ind
  %res2 = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %gep.random2, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  %res3 = fadd <16 x float> %res2, %res
  ret <16 x float>%res3
}

; A scatter with an all-false constant mask must be deleted entirely; only a
; return is emitted.
define void @zero_mask(<2 x double>%a1, <2 x ptr> %ptr) {
; ALL-LABEL: zero_mask:
; ALL:       # %bb.0:
; ALL-NEXT:    ret{{[l|q]}}
  call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> %a1, <2 x ptr> %ptr, i32 4, <2 x i1> zeroinitializer)
  ret void
}

; 2-element gather from the constant index vector <0, -2>: too narrow for a
; hardware gather, so it is scalarized into per-lane conditional loads after
; adding the constant-pool offsets to the splatted base.
define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; KNL_64-LABEL: gather_2i64_constant_indices:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpsllq $63, %xmm0, %xmm0
; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k0
; KNL_64-NEXT:    vmovq %rdi, %xmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %xmm0
; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; KNL_64-NEXT:    kmovw %k0, %eax
; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT:    testb $1, %al
; KNL_64-NEXT:    jne .LBB58_1
; KNL_64-NEXT:  # %bb.2: # %else
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    jne .LBB58_3
; KNL_64-NEXT:  .LBB58_4: # %else2
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
; KNL_64-NEXT:  .LBB58_1: # %cond.load
; KNL_64-NEXT:    vmovq %xmm1, %rcx
; KNL_64-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    je .LBB58_4
; KNL_64-NEXT:  .LBB58_3: # %cond.load1
; KNL_64-NEXT:    vpextrq $1, %xmm1, %rax
; KNL_64-NEXT:    vpinsrq $1, (%rax), %xmm0, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: gather_2i64_constant_indices:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpsllq $63, %xmm0, %xmm0
; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k0
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; KNL_32-NEXT:    kmovw %k0, %eax
; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    jne .LBB58_1
; KNL_32-NEXT:  # %bb.2: # %else
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    jne .LBB58_3
; KNL_32-NEXT:  .LBB58_4: # %else2
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
; KNL_32-NEXT:  .LBB58_1: # %cond.load
; KNL_32-NEXT:    vmovd %xmm1, %ecx
; KNL_32-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    je .LBB58_4
; KNL_32-NEXT:  .LBB58_3: # %cond.load1
; KNL_32-NEXT:    vpextrd $1, %xmm1, %eax
; KNL_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
; KNL_32-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: gather_2i64_constant_indices:
; SKX_SMALL:       # %bb.0:
; SKX_SMALL-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_SMALL-NEXT:    vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SKX_SMALL-NEXT:    kmovw %k0, %eax
; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT:    testb $1, %al
; SKX_SMALL-NEXT:    jne .LBB58_1
; SKX_SMALL-NEXT:  # %bb.2: # %else
; SKX_SMALL-NEXT:    testb $2, %al
; SKX_SMALL-NEXT:    jne .LBB58_3
; SKX_SMALL-NEXT:  .LBB58_4: # %else2
; SKX_SMALL-NEXT:    retq
; SKX_SMALL-NEXT:  .LBB58_1: # %cond.load
; SKX_SMALL-NEXT:    vmovq %xmm1, %rcx
; SKX_SMALL-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_SMALL-NEXT:    testb $2, %al
; SKX_SMALL-NEXT:    je .LBB58_4
; SKX_SMALL-NEXT:  .LBB58_3: # %cond.load1
; SKX_SMALL-NEXT:    vpextrq $1, %xmm1, %rax
; SKX_SMALL-NEXT:    vpinsrq $1, (%rax), %xmm0, %xmm0
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: gather_2i64_constant_indices:
; SKX_LARGE:       # %bb.0:
; SKX_LARGE-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_LARGE-NEXT:    vpmovq2m %xmm0, %k0
; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT:    vpaddq (%rax), %xmm0, %xmm1
; SKX_LARGE-NEXT:    kmovw %k0, %eax
; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT:    testb $1, %al
; SKX_LARGE-NEXT:    jne .LBB58_1
; SKX_LARGE-NEXT:  # %bb.2: # %else
; SKX_LARGE-NEXT:    testb $2, %al
; SKX_LARGE-NEXT:    jne .LBB58_3
; SKX_LARGE-NEXT:  .LBB58_4: # %else2
; SKX_LARGE-NEXT:    retq
; SKX_LARGE-NEXT:  .LBB58_1: # %cond.load
; SKX_LARGE-NEXT:    vmovq %xmm1, %rcx
; SKX_LARGE-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_LARGE-NEXT:    testb $2, %al
; SKX_LARGE-NEXT:    je .LBB58_4
; SKX_LARGE-NEXT:  .LBB58_3: # %cond.load1
; SKX_LARGE-NEXT:    vpextrq $1, %xmm1, %rax
; SKX_LARGE-NEXT:    vpinsrq $1, (%rax), %xmm0, %xmm0
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: gather_2i64_constant_indices:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_32-NEXT:    vpmovq2m %xmm0, %k0
; SKX_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; SKX_32-NEXT:    kmovw %k0, %eax
; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    testb $1, %al
; SKX_32-NEXT:    jne .LBB58_1
; SKX_32-NEXT:  # %bb.2: # %else
; SKX_32-NEXT:    testb $2, %al
; SKX_32-NEXT:    jne .LBB58_3
; SKX_32-NEXT:  .LBB58_4: # %else2
; SKX_32-NEXT:    retl
; SKX_32-NEXT:  .LBB58_1: # %cond.load
; SKX_32-NEXT:    vmovd %xmm1, %ecx
; SKX_32-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
; SKX_32-NEXT:    testb $2, %al
; SKX_32-NEXT:    je .LBB58_4
; SKX_32-NEXT:  .LBB58_3: # %cond.load1
; SKX_32-NEXT:    vpextrd $1, %xmm1, %eax
; SKX_32-NEXT:    vpinsrd $2, (%eax), %xmm0, %xmm0
; SKX_32-NEXT:    vpinsrd $3, 4(%eax), %xmm0, %xmm0
; SKX_32-NEXT:    retl
  %gep = getelementptr i64, ptr %ptr, <2 x i64> <i64 0, i64 -2>
  %res = tail call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> %gep, i32 8, <2 x i1> %mask, <2 x i64> zeroinitializer) #1
  ret <2 x i64> %res
}

; 16-wide gather with all-constant indices: the indices are materialized as an
; in-register constant vector feeding a single vpgatherdd.
define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
; KNL_64-LABEL: gather_16i64_constant_indices:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: gather_16i64_constant_indices:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: gather_16i64_constant_indices:
; SKX_SMALL:       # %bb.0:
; SKX_SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: gather_16i64_constant_indices:
; SKX_LARGE:       # %bb.0:
; SKX_LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT:    vmovdqa64 (%rax), %zmm1
; SKX_LARGE-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT:    vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: gather_16i64_constant_indices:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT:    vpmovd2m %zmm0, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    vpgatherdd (%eax,%zmm1,4), %zmm0 {%k1}
; SKX_32-NEXT:    retl
  %gep = getelementptr i32, ptr %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
  %res = tail call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> %gep, i32 4, <16 x i1> %mask, <16 x i32> zeroinitializer) #1
  ret <16 x i32> %res
}

; 2-element scatter to the constant index vector <0, -2>: scalarized into
; per-lane conditional stores.
define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %src0) {
; KNL_64-LABEL: scatter_2i64_constant_indices:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpsllq $63, %xmm0, %xmm0
; KNL_64-NEXT:    vptestmq %zmm0, %zmm0, %k0
; KNL_64-NEXT:    vmovq %rdi, %xmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %xmm0
; KNL_64-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; KNL_64-NEXT:    kmovw %k0, %eax
; KNL_64-NEXT:    testb $1, %al
; KNL_64-NEXT:    jne .LBB60_1
; KNL_64-NEXT:  # %bb.2: # %else
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    jne .LBB60_3
; KNL_64-NEXT:  .LBB60_4: # %else2
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
; KNL_64-NEXT:  .LBB60_1: # %cond.store
; KNL_64-NEXT:    vmovq %xmm0, %rcx
; KNL_64-NEXT:    vmovss %xmm1, (%rcx)
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    je .LBB60_4
; KNL_64-NEXT:  .LBB60_3: # %cond.store1
; KNL_64-NEXT:    vpextrq $1, %xmm0, %rax
; KNL_64-NEXT:    vextractps $1, %xmm1, (%rax)
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scatter_2i64_constant_indices:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpsllq $63, %xmm0, %xmm0
; KNL_32-NEXT:    vptestmq %zmm0, %zmm0, %k0
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; KNL_32-NEXT:    kmovw %k0, %eax
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    jne .LBB60_1
; KNL_32-NEXT:  # %bb.2: # %else
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    jne .LBB60_3
; KNL_32-NEXT:  .LBB60_4: # %else2
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
; KNL_32-NEXT:  .LBB60_1: # %cond.store
; KNL_32-NEXT:    vmovd %xmm0, %ecx
; KNL_32-NEXT:    vmovss %xmm1, (%ecx)
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    je .LBB60_4
; KNL_32-NEXT:  .LBB60_3: # %cond.store1
; KNL_32-NEXT:    vpextrd $1, %xmm0, %eax
; KNL_32-NEXT:    vextractps $1, %xmm1, (%eax)
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
; SKX_SMALL:       # %bb.0:
; SKX_SMALL-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_SMALL-NEXT:    vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT:    vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; SKX_SMALL-NEXT:    kmovw %k0, %eax
; SKX_SMALL-NEXT:    testb $1, %al
; SKX_SMALL-NEXT:    jne .LBB60_1
; SKX_SMALL-NEXT:  # %bb.2: # %else
; SKX_SMALL-NEXT:    testb $2, %al
; SKX_SMALL-NEXT:    jne .LBB60_3
; SKX_SMALL-NEXT:  .LBB60_4: # %else2
; SKX_SMALL-NEXT:    retq
; SKX_SMALL-NEXT:  .LBB60_1: # %cond.store
; SKX_SMALL-NEXT:    vmovq %xmm0, %rcx
; SKX_SMALL-NEXT:    vmovss %xmm1, (%rcx)
; SKX_SMALL-NEXT:    testb $2, %al
; SKX_SMALL-NEXT:    je .LBB60_4
; SKX_SMALL-NEXT:  .LBB60_3: # %cond.store1
; SKX_SMALL-NEXT:    vpextrq $1, %xmm0, %rax
; SKX_SMALL-NEXT:    vextractps $1, %xmm1, (%rax)
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: scatter_2i64_constant_indices:
; SKX_LARGE:       # %bb.0:
; SKX_LARGE-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_LARGE-NEXT:    vpmovq2m %xmm0, %k0
; SKX_LARGE-NEXT:    vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT:    vpaddq (%rax), %xmm0, %xmm0
; SKX_LARGE-NEXT:    kmovw %k0, %eax
; SKX_LARGE-NEXT:    testb $1, %al
; SKX_LARGE-NEXT:    jne .LBB60_1
; SKX_LARGE-NEXT:  # %bb.2: # %else
; SKX_LARGE-NEXT:    testb $2, %al
; SKX_LARGE-NEXT:    jne .LBB60_3
; SKX_LARGE-NEXT:  .LBB60_4: # %else2
; SKX_LARGE-NEXT:    retq
; SKX_LARGE-NEXT:  .LBB60_1: # %cond.store
; SKX_LARGE-NEXT:    vmovq %xmm0, %rcx
; SKX_LARGE-NEXT:    vmovss %xmm1, (%rcx)
; SKX_LARGE-NEXT:    testb $2, %al
; SKX_LARGE-NEXT:    je .LBB60_4
; SKX_LARGE-NEXT:  .LBB60_3: # %cond.store1
; SKX_LARGE-NEXT:    vpextrq $1, %xmm0, %rax
; SKX_LARGE-NEXT:    vextractps $1, %xmm1, (%rax)
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: scatter_2i64_constant_indices:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpsllq $63, %xmm0, %xmm0
; SKX_32-NEXT:    vpmovq2m %xmm0, %k0
; SKX_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; SKX_32-NEXT:    kmovw %k0, %eax
; SKX_32-NEXT:    testb $1, %al
; SKX_32-NEXT:    jne .LBB60_1
; SKX_32-NEXT:  # %bb.2: # %else
; SKX_32-NEXT:    testb $2, %al
; SKX_32-NEXT:    jne .LBB60_3
; SKX_32-NEXT:  .LBB60_4: # %else2
; SKX_32-NEXT:    retl
; SKX_32-NEXT:  .LBB60_1: # %cond.store
; SKX_32-NEXT:    vmovd %xmm0, %ecx
; SKX_32-NEXT:    vmovss %xmm1, (%ecx)
; SKX_32-NEXT:    testb $2, %al
; SKX_32-NEXT:    je .LBB60_4
; SKX_32-NEXT:  .LBB60_3: # %cond.store1
; SKX_32-NEXT:    vpextrd $1, %xmm0, %eax
; SKX_32-NEXT:    vextractps $1, %xmm1, (%eax)
; SKX_32-NEXT:    retl
  %gep = getelementptr i32, ptr %ptr, <2 x i64> <i64 0, i64 -2>
  call void @llvm.masked.scatter.v2i32.v2p0(<2 x i32> %src0, <2 x ptr> %gep, i32 4, <2 x i1> %mask)
  ret void
}

; 16-wide scatter with all-constant indices: the indices become an in-register
; constant vector feeding a single vpscatterdd.
define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32> %src0) {
; KNL_64-LABEL: scatter_16i64_constant_indices:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL_64-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scatter_16i64_constant_indices:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpmovsxbd %xmm0, %zmm0
; KNL_32-NEXT:    vpslld $31, %zmm0, %zmm0
; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX_SMALL-LABEL: scatter_16i64_constant_indices:
; SKX_SMALL:       # %bb.0:
; SKX_SMALL-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_SMALL-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_SMALL-NEXT:    vpmovd2m %zmm0, %k1
; SKX_SMALL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX_SMALL-NEXT:    vzeroupper
; SKX_SMALL-NEXT:    retq
;
; SKX_LARGE-LABEL: scatter_16i64_constant_indices:
; SKX_LARGE:       # %bb.0:
; SKX_LARGE-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_LARGE-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_LARGE-NEXT:    vpmovd2m %zmm0, %k1
; SKX_LARGE-NEXT:    movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT:    vmovdqa64 (%rax), %zmm0
; SKX_LARGE-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX_LARGE-NEXT:    vzeroupper
; SKX_LARGE-NEXT:    retq
;
; SKX_32-LABEL: scatter_16i64_constant_indices:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpmovsxbd %xmm0, %zmm0
; SKX_32-NEXT:    vpslld $31, %zmm0, %zmm0
; SKX_32-NEXT:    vpmovd2m %zmm0, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %gep = getelementptr i32, ptr %ptr, <16 x i64> <i64 0, i64 -2, i64 1, i64 -8, i64 10, i64 20, i64 50, i64 65536, i64 16777215, i64 2147483647, i64 100, i64 -2000, i64 -2147483648, i64 76897723, i64 7, i64 -67897687>
  call void @llvm.masked.scatter.v16i32.v16p0(<16 x i32> %src0, <16 x ptr> %gep, i32 4, <16 x i1> %mask)
  ret void
}

; Gather through a base pointer splatted across all lanes: on SKX this folds
; into a single vpgatherdd with a zero index vector; KNL scalarizes the
; 4-element form into conditional loads.
define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru) {
; KNL_64-LABEL: splat_ptr_gather:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpslld $31, %xmm0, %xmm0
; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL_64-NEXT:    vmovq %rdi, %xmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %ymm0
; KNL_64-NEXT:    kmovw %k0, %eax
; KNL_64-NEXT:    testb $1, %al
; KNL_64-NEXT:    je .LBB62_2
; KNL_64-NEXT:  # %bb.1: # %cond.load
; KNL_64-NEXT:    vmovq %xmm0, %rcx
; KNL_64-NEXT:    vpinsrd $0, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT:  .LBB62_2: # %else
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    je .LBB62_4
; KNL_64-NEXT:  # %bb.3: # %cond.load1
; KNL_64-NEXT:    vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT:    vpinsrd $1, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT:  .LBB62_4: # %else2
; KNL_64-NEXT:    testb $4, %al
; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT:    jne .LBB62_5
; KNL_64-NEXT:  # %bb.6: # %else5
; KNL_64-NEXT:    testb $8, %al
; KNL_64-NEXT:    jne .LBB62_7
; KNL_64-NEXT:  .LBB62_8: # %else8
; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
; KNL_64-NEXT:  .LBB62_5: # %cond.load4
; KNL_64-NEXT:    vmovq %xmm0, %rcx
; KNL_64-NEXT:    vpinsrd $2, (%rcx), %xmm1, %xmm1
; KNL_64-NEXT:    testb $8, %al
; KNL_64-NEXT:    je .LBB62_8
; KNL_64-NEXT:  .LBB62_7: # %cond.load7
; KNL_64-NEXT:    vpextrq $1, %xmm0, %rax
; KNL_64-NEXT:    vpinsrd $3, (%rax), %xmm1, %xmm1
; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: splat_ptr_gather:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpslld $31, %xmm0, %xmm0
; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT:    kmovw %k0, %eax
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    jne .LBB62_1
; KNL_32-NEXT:  # %bb.2: # %else
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    jne .LBB62_3
; KNL_32-NEXT:  .LBB62_4: # %else2
; KNL_32-NEXT:    testb $4, %al
; KNL_32-NEXT:    jne .LBB62_5
; KNL_32-NEXT:  .LBB62_6: # %else5
; KNL_32-NEXT:    testb $8, %al
; KNL_32-NEXT:    jne .LBB62_7
; KNL_32-NEXT:  .LBB62_8: # %else8
; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
; KNL_32-NEXT:  .LBB62_1: # %cond.load
; KNL_32-NEXT:    vmovd %xmm0, %ecx
; KNL_32-NEXT:    vpinsrd $0, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    je .LBB62_4
; KNL_32-NEXT:  .LBB62_3: # %cond.load1
; KNL_32-NEXT:    vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT:    vpinsrd $1, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT:    testb $4, %al
; KNL_32-NEXT:    je .LBB62_6
; KNL_32-NEXT:  .LBB62_5: # %cond.load4
; KNL_32-NEXT:    vpextrd $2, %xmm0, %ecx
; KNL_32-NEXT:    vpinsrd $2, (%ecx), %xmm1, %xmm1
; KNL_32-NEXT:    testb $8, %al
; KNL_32-NEXT:    je .LBB62_8
; KNL_32-NEXT:  .LBB62_7: # %cond.load7
; KNL_32-NEXT:    vpextrd $3, %xmm0, %eax
; KNL_32-NEXT:    vpinsrd $3, (%eax), %xmm1, %xmm1
; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: splat_ptr_gather:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
; SKX-NEXT:    vpmovd2m %xmm0, %k1
; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT:    vpgatherdd (%rdi,%xmm0,4), %xmm1 {%k1}
; SKX-NEXT:    vmovdqa %xmm1, %xmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: splat_ptr_gather:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm0, %xmm0
; SKX_32-NEXT:    vpmovd2m %xmm0, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    vpgatherdd (%eax,%xmm0,4), %xmm1 {%k1}
; SKX_32-NEXT:    vmovdqa %xmm1, %xmm0
; SKX_32-NEXT:    retl
  %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0
  %2 = shufflevector <4 x ptr> %1, <4 x ptr> undef, <4 x i32> zeroinitializer
  %3 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> %2, i32 4, <4 x i1> %mask, <4 x i32> %passthru)
  ret <4 x i32> %3
}
declare <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr>, i32, <4 x i1>, <4 x i32>)

; Scatter counterpart of splat_ptr_gather: SKX folds the splatted base into a
; single vpscatterdd with a zero index vector; KNL scalarizes the 4-element
; form into conditional stores.
define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_64-LABEL: splat_ptr_scatter:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpslld $31, %xmm0, %xmm0
; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL_64-NEXT:    vmovq %rdi, %xmm0
; KNL_64-NEXT:    vpbroadcastq %xmm0, %ymm0
; KNL_64-NEXT:    kmovw %k0, %eax
; KNL_64-NEXT:    testb $1, %al
; KNL_64-NEXT:    je .LBB63_2
; KNL_64-NEXT:  # %bb.1: # %cond.store
; KNL_64-NEXT:    vmovq %xmm0, %rcx
; KNL_64-NEXT:    vmovss %xmm1, (%rcx)
; KNL_64-NEXT:  .LBB63_2: # %else
; KNL_64-NEXT:    testb $2, %al
; KNL_64-NEXT:    je .LBB63_4
; KNL_64-NEXT:  # %bb.3: # %cond.store1
; KNL_64-NEXT:    vpextrq $1, %xmm0, %rcx
; KNL_64-NEXT:    vextractps $1, %xmm1, (%rcx)
; KNL_64-NEXT:  .LBB63_4: # %else2
; KNL_64-NEXT:    testb $4, %al
; KNL_64-NEXT:    vextracti128 $1, %ymm0, %xmm0
; KNL_64-NEXT:    jne .LBB63_5
; KNL_64-NEXT:  # %bb.6: # %else4
; KNL_64-NEXT:    testb $8, %al
; KNL_64-NEXT:    jne .LBB63_7
; KNL_64-NEXT:  .LBB63_8: # %else6
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
; KNL_64-NEXT:  .LBB63_5: # %cond.store3
; KNL_64-NEXT:    vmovq %xmm0, %rcx
; KNL_64-NEXT:    vextractps $2, %xmm1, (%rcx)
; KNL_64-NEXT:    testb $8, %al
; KNL_64-NEXT:    je .LBB63_8
; KNL_64-NEXT:  .LBB63_7: # %cond.store5
; KNL_64-NEXT:    vpextrq $1, %xmm0, %rax
; KNL_64-NEXT:    vextractps $3, %xmm1, (%rax)
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: splat_ptr_scatter:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    vpslld $31, %xmm0, %xmm0
; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k0
; KNL_32-NEXT:    vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; KNL_32-NEXT:    kmovw %k0, %eax
; KNL_32-NEXT:    testb $1, %al
; KNL_32-NEXT:    jne .LBB63_1
; KNL_32-NEXT:  # %bb.2: # %else
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    jne .LBB63_3
; KNL_32-NEXT:  .LBB63_4: # %else2
; KNL_32-NEXT:    testb $4, %al
; KNL_32-NEXT:    jne .LBB63_5
; KNL_32-NEXT:  .LBB63_6: # %else4
; KNL_32-NEXT:    testb $8, %al
; KNL_32-NEXT:    jne .LBB63_7
; KNL_32-NEXT:  .LBB63_8: # %else6
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
; KNL_32-NEXT:  .LBB63_1: # %cond.store
; KNL_32-NEXT:    vmovd %xmm0, %ecx
; KNL_32-NEXT:    vmovss %xmm1, (%ecx)
; KNL_32-NEXT:    testb $2, %al
; KNL_32-NEXT:    je .LBB63_4
; KNL_32-NEXT:  .LBB63_3: # %cond.store1
; KNL_32-NEXT:    vpextrd $1, %xmm0, %ecx
; KNL_32-NEXT:    vextractps $1, %xmm1, (%ecx)
; KNL_32-NEXT:    testb $4, %al
; KNL_32-NEXT:    je .LBB63_6
; KNL_32-NEXT:  .LBB63_5: # %cond.store3
; KNL_32-NEXT:    vpextrd $2, %xmm0, %ecx
; KNL_32-NEXT:    vextractps $2, %xmm1, (%ecx)
; KNL_32-NEXT:    testb $8, %al
; KNL_32-NEXT:    je .LBB63_8
; KNL_32-NEXT:  .LBB63_7: # %cond.store5
; KNL_32-NEXT:    vpextrd $3, %xmm0, %eax
; KNL_32-NEXT:    vextractps $3, %xmm1, (%eax)
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: splat_ptr_scatter:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $31, %xmm0, %xmm0
; SKX-NEXT:    vpmovd2m %xmm0, %k1
; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; SKX-NEXT:    retq
;
; SKX_32-LABEL: splat_ptr_scatter:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    vpslld $31, %xmm0, %xmm0
; SKX_32-NEXT:    vpmovd2m %xmm0, %k1
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    vpscatterdd %xmm1, (%eax,%xmm0,4) {%k1}
; SKX_32-NEXT:    retl
  %1 = insertelement <4 x ptr> undef, ptr %ptr, i32 0
  %2 = shufflevector <4 x ptr> %1, <4 x ptr> undef, <4 x i32> zeroinitializer
  call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> %val, <4 x ptr> %2, i32 4, <4 x i1> %mask)
  ret void
}

;
; PR13310
; Failure to fold scaled-index into gather/scatter scale operand.
;

define <8 x float> @scaleidx_x86gather(ptr %base, <8 x i32> %index, <8 x i32> %imask) nounwind {
; KNL_64-LABEL: scaleidx_x86gather:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpslld $2, %ymm0, %ymm2
; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT:    vgatherdps %ymm1, (%rdi,%ymm2), %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scaleidx_x86gather:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; KNL_32-NEXT:    vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
; KNL_32-NEXT:    vmovaps %ymm2, %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: scaleidx_x86gather:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $2, %ymm0, %ymm2
; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT:    vgatherdps %ymm1, (%rdi,%ymm2), %ymm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: scaleidx_x86gather:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; SKX_32-NEXT:    vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
; SKX_32-NEXT:    vmovaps %ymm2, %ymm0
; SKX_32-NEXT:    retl
  %mask = bitcast <8 x i32> %imask to <8 x float>
  %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %base, <8 x i32> %scaledindex, <8 x float> %mask, i8 1) nounwind
  ret <8 x float> %v
}

define <8 x float> @scaleidx_x86gather_outofrange(ptr %base, <8 x i32> %index, <8 x i32> %imask) nounwind {
; KNL_64-LABEL: scaleidx_x86gather_outofrange:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    vpslld $2, %ymm0, %ymm2
; KNL_64-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_64-NEXT:    vgatherdps %ymm1, (%rdi,%ymm2,4), %ymm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scaleidx_x86gather_outofrange:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpslld $2, %ymm0, %ymm2
; KNL_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT:    vgatherdps %ymm1, (%eax,%ymm2,4), %ymm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: scaleidx_x86gather_outofrange:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $2, %ymm0, %ymm2
; SKX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX-NEXT:    vgatherdps %ymm1, (%rdi,%ymm2,4), %ymm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: scaleidx_x86gather_outofrange:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpslld $2, %ymm0, %ymm2
; SKX_32-NEXT:    vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT:    vgatherdps %ymm1, (%eax,%ymm2,4), %ymm0
; SKX_32-NEXT:    retl
  %mask = bitcast <8 x i32> %imask to <8 x float>
  %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %v = call <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float> undef, ptr %base, <8 x i32> %scaledindex, <8 x float> %mask, i8 4) nounwind
  ret <8 x float> %v
}
declare <8 x float> @llvm.x86.avx2.gather.d.ps.256(<8 x float>, ptr, <8 x i32>, <8 x float>, i8) nounwind readonly

define void @scaleidx_x86scatter(<16 x float> %value, ptr %base, <16 x i32> %index, i16 %imask) nounwind {
; KNL_64-LABEL: scaleidx_x86scatter:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    kmovw %esi, %k1
; KNL_64-NEXT:    vpaddd %zmm1, %zmm1, %zmm1
; KNL_64-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scaleidx_x86scatter:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: scaleidx_x86scatter:
; SKX:       # %bb.0:
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vpaddd %zmm1, %zmm1, %zmm1
; SKX-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: scaleidx_x86scatter:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %mask = bitcast i16 %imask to <16 x i1>
  %scaledindex = shl <16 x i32> %index, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %base, <16 x i1> %mask, <16 x i32> %scaledindex, <16 x float> %value, i32 2)
  ret void
}
declare void @llvm.x86.avx512.mask.scatter.dps.512(ptr, <16 x i1>, <16 x i32>, <16 x float>, i32)

define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i8 %imask) nounwind {
; KNL_64-LABEL: scaleidx_scatter:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_64-NEXT:    vpaddd %ymm1, %ymm1, %ymm1
; KNL_64-NEXT:    kmovw %esi, %k0
; KNL_64-NEXT:    kshiftlw $8, %k0, %k0
; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
; KNL_64-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scaleidx_scatter:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpaddd %ymm1, %ymm1, %ymm1
; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT:    kmovw %ecx, %k0
; KNL_32-NEXT:    kshiftlw $8, %k0, %k0
; KNL_32-NEXT:    kshiftrw $8, %k0, %k1
; KNL_32-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: scaleidx_scatter:
; SKX:       # %bb.0:
; SKX-NEXT:    vpaddd %ymm1, %ymm1, %ymm1
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: scaleidx_scatter:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    vscatterdps %ymm0, (%eax,%ymm1,8) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %scaledindex = mul <8 x i32> %index, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %ptrs = getelementptr float, ptr %base, <8 x i32> %scaledindex
  %mask = bitcast i8 %imask to <8 x i1>
  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, i32 1, <8 x i1> %mask)
  ret void
}

define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32> %index, i8 %imask) nounwind {
; KNL_64-LABEL: scaleidx_scatter_outofrange:
; KNL_64:       # %bb.0:
; KNL_64-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_64-NEXT:    vpslld $2, %ymm1, %ymm1
; KNL_64-NEXT:    kmovw %esi, %k0
; KNL_64-NEXT:    kshiftlw $8, %k0, %k0
; KNL_64-NEXT:    kshiftrw $8, %k0, %k1
; KNL_64-NEXT:    vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
; KNL_64-NEXT:    vzeroupper
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: scaleidx_scatter_outofrange:
; KNL_32:       # %bb.0:
; KNL_32-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT:    vpslld $2, %ymm1, %ymm1
; KNL_32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
; KNL_32-NEXT:    kmovw %ecx, %k0
; KNL_32-NEXT:    kshiftlw $8, %k0, %k0
; KNL_32-NEXT:    kshiftrw $8, %k0, %k1
; KNL_32-NEXT:    vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
; KNL_32-NEXT:    vzeroupper
; KNL_32-NEXT:    retl
;
; SKX-LABEL: scaleidx_scatter_outofrange:
; SKX:       # %bb.0:
; SKX-NEXT:    vpslld $2, %ymm1, %ymm1
; SKX-NEXT:    kmovw %esi, %k1
; SKX-NEXT:    vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
; SKX-NEXT:    vzeroupper
; SKX-NEXT:    retq
;
; SKX_32-LABEL: scaleidx_scatter_outofrange:
; SKX_32:       # %bb.0:
; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT:    vpslld $2, %ymm1, %ymm1
; SKX_32-NEXT:    kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT:    vscatterdps %ymm0, (%eax,%ymm1,4) {%k1}
; SKX_32-NEXT:    vzeroupper
; SKX_32-NEXT:    retl
  %scaledindex = mul <8 x i32> %index, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %ptrs = getelementptr float, ptr %base, <8 x i32> %scaledindex
  %mask = bitcast i8 %imask to <8 x i1>
  call void @llvm.masked.scatter.v8f32.v8p0(<8 x float> %value, <8 x ptr> %ptrs, i32 2, <8 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v8f32.v8p0(<8 x float>, <8 x ptr>, i32 immarg, <8 x i1>)

;
; PR45906
; This used to cause fast-isel to generate bad copy instructions that would
; cause an error in copyPhysReg.
;

%struct.foo = type { ptr, i64, i16, i16, i32 }

define <8 x i64> @pr45906(<8 x ptr> %ptr) {
; KNL_64-LABEL: pr45906:
; KNL_64:       # %bb.0: # %bb
; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
; KNL_64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; KNL_64-NEXT:    vpgatherqq 8(,%zmm0), %zmm1 {%k1}
; KNL_64-NEXT:    vmovdqa64 %zmm1, %zmm0
; KNL_64-NEXT:    retq
;
; KNL_32-LABEL: pr45906:
; KNL_32:       # %bb.0: # %bb
; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
; KNL_32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; KNL_32-NEXT:    vpgatherdq 4(,%ymm0), %zmm1 {%k1}
; KNL_32-NEXT:    vmovdqa64 %zmm1, %zmm0
; KNL_32-NEXT:    retl
;
; SKX-LABEL: pr45906:
; SKX:       # %bb.0: # %bb
; SKX-NEXT:    kxnorw %k0, %k0, %k1
; SKX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT:    vpgatherqq 8(,%zmm0), %zmm1 {%k1}
; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0
; SKX-NEXT:    retq
;
; SKX_32-LABEL: pr45906:
; SKX_32:       # %bb.0: # %bb
; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
; SKX_32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; SKX_32-NEXT:    vpgatherdq 4(,%ymm0), %zmm1 {%k1}
; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
; SKX_32-NEXT:    retl
bb:
  %tmp = getelementptr inbounds %struct.foo, <8 x ptr> %ptr, i64 0, i32 1
  %tmp1 = call <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr> %tmp, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef)
  ret <8 x i64> %tmp1
}
declare <8 x i64> @llvm.masked.gather.v8i64.v8p0(<8 x ptr>, i32, <8 x i1>, <8 x i64>)