; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl,+avx512dq | FileCheck %s
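;;
;; Floating-Point Gather/Scatter
;;

; Each gather_mask_* test feeds a masked gather straight into the matching
; masked scatter at offset indices. A rough C equivalent of gather_mask_dps,
; sketched with the standard <immintrin.h> intrinsics:
;
;   __m512 v = _mm512_mask_i32gather_ps(src, mask, ind, base, 4);
;   _mm512_mask_i32scatter_ps(stbuf, mask, ind2, v, 4);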
define dso_local void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x float> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpd.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpd.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x double> %x, i32 4)
  ret void
}
;;
;; Integer Gather/Scatter
;;

define dso_local void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = call <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpi.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind2, <16 x i32> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpi.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i32> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qpq.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind2, <8 x i64> %x, i32 4)
  ret void
}

define dso_local void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dq:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
; CHECK-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatter.dpq.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind2, <8 x i64> %x, i32 4)
  ret void
}
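;;
;; Execution Domain
;;

; The *_execdomain tests check that gathered/scattered FP values stay in the
; floating-point domain: the surrounding register moves and stores should be
; vmovaps/vmovapd rather than their integer-domain equivalents.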
define dso_local void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double> %src, ptr %base, <8 x i32> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define dso_local void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = call <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  store <8 x double> %x, ptr %stbuf
  ret void
}

define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vmovaps %zmm1, %zmm0
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> %1, i32 4)
  ret <16 x float> %res
}

define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, ptr %base) {
; CHECK-LABEL: gather_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %edi, %k1
; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> %1, i32 4)
  ret <8 x float> %res
}

define dso_local void @scatter_mask_dpd_execdomain(<8 x i32> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dpd.512(ptr %stbuf, <8 x i1> %1, <8 x i32> %ind, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_qpd_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qpd_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovapd (%rdi), %zmm1
; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x double>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.qpd.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x double> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_dps_execdomain(<16 x i32> %ind, ptr %src, i16 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_dps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %zmm1
; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i16 %mask to <16 x i1>
  %x = load <16 x float>, ptr %src, align 64
  call void @llvm.x86.avx512.mask.scatter.dps.512(ptr %stbuf, <16 x i1> %1, <16 x i32> %ind, <16 x float> %x, i32 4)
  ret void
}

define dso_local void @scatter_mask_qps_execdomain(<8 x i64> %ind, ptr %src, i8 %mask, ptr %base, ptr %stbuf) {
; CHECK-LABEL: scatter_mask_qps_execdomain:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovaps (%rdi), %ymm1
; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %mask to <8 x i1>
  %x = load <8 x float>, ptr %src, align 32
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> %1, <8 x i64> %ind, <8 x float> %x, i32 4)
  ret void
}

define dso_local void @gather_qps(<8 x i64> %ind, <8 x float> %src, ptr %base, ptr %stbuf) {
; CHECK-LABEL: gather_qps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
; CHECK-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %x = call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> %src, ptr %base, <8 x i64> %ind, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
  call void @llvm.x86.avx512.mask.scatter.qps.512(ptr %stbuf, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> %ind2, <8 x float> %x, i32 4)
  ret void
}
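;;
;; 128/256-bit Gathers, 64-bit Indices (AVX512VL)
;;

; The gather3div* intrinsics take 64-bit (q) indices and select into
; vgatherq*/vpgatherq*. Where a test issues the same gather twice with
; identical operands (e.g. gather3div2_di), the second load is CSE'd and the
; result is simply added to itself.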
define <2 x double> @test_int_x86_avx512_mask_gather3div2_df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3div2_di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3div4_df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3div4_di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 8)
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div4_sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div4_si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> <i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32> %x0, ptr %x1, <2 x i64> %x2, <2 x i1> %extract, i32 4)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3div8_sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3div8_si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3div8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32> %x0, ptr %x1, <4 x i64> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}
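;;
;; 128/256-bit Gathers, 32-bit Indices (AVX512VL)
;;

; The gather3siv* intrinsics take 32-bit (d) indices and select into
; vgatherd*/vpgatherd*.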
define <2 x double> @test_int_x86_avx512_mask_gather3siv2_df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract, i32 4)
  %res1 = call <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> <i1 true, i1 true>, i32 2)
  %res2 = fadd <2 x double> %res, %res1
  ret <2 x double> %res2
}

define <2 x i64> @test_int_x86_avx512_mask_gather3siv2_di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <2 x i32> <i32 0, i32 1>
  %res = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
  %res1 = call <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64> %x0, ptr %x1, <4 x i32> %x2, <2 x i1> %extract, i32 8)
  %res2 = add <2 x i64> %res, %res1
  ret <2 x i64> %res2
}

define <4 x double> @test_int_x86_avx512_mask_gather3siv4_df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x double> %res, %res1
  ret <4 x double> %res2
}

define <4 x i64> @test_int_x86_avx512_mask_gather3siv4_di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract1 = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract1, i32 8)
  %2 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 8)
  %res2 = add <4 x i64> %res, %res1
  ret <4 x i64> %res2
}

define <4 x float> @test_int_x86_avx512_mask_gather3siv4_sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 4)
  %res1 = call <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <4 x float> %res, %res1
  ret <4 x float> %res2
}

define <4 x i32> @test_int_x86_avx512_mask_gather3siv4_si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k1}
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT:    retq
  %res = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)
  %1 = bitcast i8 %x3 to <8 x i1>
  %extract = shufflevector <8 x i1> %1, <8 x i1> %1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %res1 = call <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32> %x0, ptr %x1, <4 x i32> %x2, <4 x i1> %extract, i32 2)
  %res2 = add <4 x i32> %res, %res1
  ret <4 x i32> %res2
}

define <8 x float> @test_int_x86_avx512_mask_gather3siv8_sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %res1 = call <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 2)
  %res2 = fadd <8 x float> %res, %res1
  ret <8 x float> %res2
}

define <8 x i32> @test_int_x86_avx512_mask_gather3siv8_si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_gather3siv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
; CHECK-NEXT:    kmovq %k1, %k2
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,2), %ymm0 {%k1}
; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x3 to <8 x i1>
  %res = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %1, i32 4)
  %2 = bitcast i8 %x3 to <8 x i1>
  %res1 = call <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32> %x0, ptr %x1, <8 x i32> %x2, <8 x i1> %2, i32 2)
  %res2 = add <8 x i32> %res, %res1
  ret <8 x i32> %res2
}
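;;
;; 128/256-bit Scatters, 64-bit Indices (AVX512VL)
;;

; scatterdiv* mirrors gather3div*: 64-bit (q) indices, with an all-ones mask
; materialized via kxnorw.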
define dso_local void @test_int_x86_avx512_scatterdiv2_df(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.df(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.df(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <2 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv2_di(ptr %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv2.di(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv2.di(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_df(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.df(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.df(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_di(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv4.di(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.di(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_sf(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv4_si(ptr %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scatterdiv4.si(ptr %x0, <2 x i1> <i1 true, i1 true>, <2 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv4.si(ptr %x0, <2 x i1> %2, <2 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv8_sf(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scatterdiv8_si(ptr %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scatterdiv8.si(ptr %x0, <4 x i1> %2, <4 x i64> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scatterdiv8.si(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> %x2, <4 x i32> %x3, i32 4)
  ret void
}
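;;
;; 128/256-bit Scatters, 32-bit Indices (AVX512VL)
;;

; The scattersiv* intrinsics take 32-bit (d) indices and select into
; vscatterd*/vpscatterd*.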
define dso_local void @test_int_x86_avx512_scattersiv2_df(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.df(ptr %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.df(ptr %x0, <2 x i1> %2, <4 x i32> %x2, <2 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv2_di(ptr %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  call void @llvm.x86.avx512.mask.scattersiv2.di(ptr %x0, <2 x i1> <i1 true, i1 true>, <4 x i32> %x2, <2 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv2.di(ptr %x0, <2 x i1> %2, <4 x i32> %x2, <2 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_df(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.df(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x double> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.df(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x double> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_di(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    kxnorw %k0, %k0, %k2
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.di(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i64> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.di(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i64> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_sf(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.sf(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.sf(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv4_si(ptr %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  %2 = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx512.mask.scattersiv4.si(ptr %x0, <4 x i1> %2, <4 x i32> %x2, <4 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv4.si(ptr %x0, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %x2, <4 x i32> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv8_sf(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.sf(ptr %x0, <8 x i1> %1, <8 x i32> %x2, <8 x float> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.sf(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x float> %x3, i32 4)
  ret void
}

define dso_local void @test_int_x86_avx512_scattersiv8_si(ptr %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kmovd %esi, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  %1 = bitcast i8 %x1 to <8 x i1>
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> %1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}
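;;
;; Constant Masks
;;

; scatter_mask_test and gather_mask_test exercise constant masks: all-ones
; comes from kxnorw, an all-zero mask still emits the access under a zeroed
; k-register (kxorw), and other immediates go through movb/movw + kmovd
; (0x60 = 96 covers elements 5-6; 220 = 0xdc covers elements 2-4 and 6-7).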
define dso_local void @scatter_mask_test(ptr %x0, <8 x i32> %x2, <8 x i32> %x3) {
; CHECK-LABEL: scatter_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    movb $1, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
; CHECK-NEXT:    movb $96, %al
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> zeroinitializer, <8 x i32> %x2, <8 x i32> %x3, i32 4)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> bitcast (<1 x i8> <i8 1> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 2)
  call void @llvm.x86.avx512.mask.scattersiv8.si(ptr %x0, <8 x i1> bitcast (<1 x i8> <i8 96> to <8 x i1>), <8 x i32> %x2, <8 x i32> %x3, i32 4)
  ret void
}

define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, ptr %base) {
; CHECK-LABEL: gather_mask_test:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
; CHECK-NEXT:    kxorw %k0, %k0, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm2
; CHECK-NEXT:    movw $1, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vmovaps %zmm1, %zmm3
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
; CHECK-NEXT:    movw $220, %ax
; CHECK-NEXT:    kmovd %eax, %k1
; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
; CHECK-NEXT:    vaddps %zmm2, %zmm0, %zmm0
; CHECK-NEXT:    retq
  %res = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  %res1 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> zeroinitializer, i32 4)
  %res2 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 1> to <16 x i1>), i32 4)
  %res3 = call <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float> %src, ptr %base, <16 x i32> %ind, <16 x i1> bitcast (<1 x i16> <i16 220> to <16 x i1>), i32 4)
  %res4 = fadd <16 x float> %res, %res1
  %res5 = fadd <16 x float> %res3, %res2
  %res6 = fadd <16 x float> %res5, %res4
  ret <16 x float> %res6
}
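; gather_global gathers from a global array, so the symbol is folded straight
; into the memory operand (x(,%zmm0,4)) and no base register is needed.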
@x = dso_local global [1024 x float] zeroinitializer, align 16

define <8 x float> @gather_global(<8 x i64>, ptr nocapture readnone) {
; CHECK-LABEL: gather_global:
; CHECK:       # %bb.0:
; CHECK-NEXT:    kxnorw %k0, %k0, %k1
; CHECK-NEXT:    vxorps %xmm1, %xmm1, %xmm1
; CHECK-NEXT:    vgatherqps x(,%zmm0,4), %ymm1 {%k1}
; CHECK-NEXT:    vmovaps %ymm1, %ymm0
; CHECK-NEXT:    retq
  %3 = tail call <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float> zeroinitializer, ptr @x, <8 x i64> %0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 4)
  ret <8 x float> %3
}
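;;
;; Intrinsic Declarations
;;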
declare <16 x float> @llvm.x86.avx512.mask.gather.dps.512(<16 x float>, ptr, <16 x i32>, <16 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.dpd.512(<8 x double>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather.qps.512(<8 x float>, ptr, <8 x i64>, <8 x i1>, i32)
declare <8 x double> @llvm.x86.avx512.mask.gather.qpd.512(<8 x double>, ptr, <8 x i64>, <8 x i1>, i32)
declare <16 x i32> @llvm.x86.avx512.mask.gather.dpi.512(<16 x i32>, ptr, <16 x i32>, <16 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.dpq.512(<8 x i64>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather.qpi.512(<8 x i32>, ptr, <8 x i64>, <8 x i1>, i32)
declare <8 x i64> @llvm.x86.avx512.mask.gather.qpq.512(<8 x i64>, ptr, <8 x i64>, <8 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3div2.df(<2 x double>, ptr, <2 x i64>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3div2.di(<2 x i64>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3div4.df(<4 x double>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3div4.di(<4 x i64>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div4.sf(<4 x float>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div4.si(<4 x i32>, ptr, <2 x i64>, <2 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3div8.sf(<4 x float>, ptr, <4 x i64>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3div8.si(<4 x i32>, ptr, <4 x i64>, <4 x i1>, i32)
declare <2 x double> @llvm.x86.avx512.mask.gather3siv2.df(<2 x double>, ptr, <4 x i32>, <2 x i1>, i32)
declare <2 x i64> @llvm.x86.avx512.mask.gather3siv2.di(<2 x i64>, ptr, <4 x i32>, <2 x i1>, i32)
declare <4 x double> @llvm.x86.avx512.mask.gather3siv4.df(<4 x double>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x i64> @llvm.x86.avx512.mask.gather3siv4.di(<4 x i64>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x float> @llvm.x86.avx512.mask.gather3siv4.sf(<4 x float>, ptr, <4 x i32>, <4 x i1>, i32)
declare <4 x i32> @llvm.x86.avx512.mask.gather3siv4.si(<4 x i32>, ptr, <4 x i32>, <4 x i1>, i32)
declare <8 x float> @llvm.x86.avx512.mask.gather3siv8.sf(<8 x float>, ptr, <8 x i32>, <8 x i1>, i32)
declare <8 x i32> @llvm.x86.avx512.mask.gather3siv8.si(<8 x i32>, ptr, <8 x i32>, <8 x i1>, i32)
declare void @llvm.x86.avx512.mask.scatter.dps.512(ptr, <16 x i1>, <16 x i32>, <16 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpd.512(ptr, <8 x i1>, <8 x i32>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.qps.512(ptr, <8 x i1>, <8 x i64>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpd.512(ptr, <8 x i1>, <8 x i64>, <8 x double>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpi.512(ptr, <16 x i1>, <16 x i32>, <16 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.dpq.512(ptr, <8 x i1>, <8 x i32>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpi.512(ptr, <8 x i1>, <8 x i64>, <8 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatter.qpq.512(ptr, <8 x i1>, <8 x i64>, <8 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.df(ptr, <2 x i1>, <2 x i64>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv2.di(ptr, <2 x i1>, <2 x i64>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.df(ptr, <4 x i1>, <4 x i64>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.di(ptr, <4 x i1>, <4 x i64>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.sf(ptr, <2 x i1>, <2 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv4.si(ptr, <2 x i1>, <2 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.sf(ptr, <4 x i1>, <4 x i64>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scatterdiv8.si(ptr, <4 x i1>, <4 x i64>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.df(ptr, <2 x i1>, <4 x i32>, <2 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv2.di(ptr, <2 x i1>, <4 x i32>, <2 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.df(ptr, <4 x i1>, <4 x i32>, <4 x double>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.di(ptr, <4 x i1>, <4 x i32>, <4 x i64>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.sf(ptr, <4 x i1>, <4 x i32>, <4 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv4.si(ptr, <4 x i1>, <4 x i32>, <4 x i32>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.sf(ptr, <8 x i1>, <8 x i32>, <8 x float>, i32)
declare void @llvm.x86.avx512.mask.scattersiv8.si(ptr, <8 x i1>, <8 x i32>, <8 x i32>, i32)