; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; NOTE(review): the min=2048 RUN line is checked against the same VBITS_GE_512
; prefixes as the min=512 RUN line, i.e. the expected code generation is shared
; for every vector length of 512 bits or more. Do not hand-edit the CHECK
; lines below; regenerate them with update_llc_test_checks.py.

target triple = "aarch64-unknown-linux-gnu"

;
; LD1B
;

define void @masked_gather_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrb w8, [x0]
; CHECK-NEXT:    ldrb w9, [x0, #1]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    st1b { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x i8>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i8> %cval, zeroinitializer
  %vals = call <2 x i8> @llvm.masked.gather.v2i8(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i8> undef)
  store <2 x i8> %vals, ptr %a
  ret void
}

define void @masked_gather_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1b { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x i8>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i8> %cval, zeroinitializer
  %vals = call <4 x i8> @llvm.masked.gather.v4i8(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i8> undef)
  store <4 x i8> %vals, ptr %a
  ret void
}

define void @masked_gather_v8i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    zip2 v1.8b, v0.8b, v0.8b
; VBITS_GE_256-NEXT:    zip1 v0.8b, v0.8b, v0.8b
; VBITS_GE_256-NEXT:    shl v1.4h, v1.4h, #8
; VBITS_GE_256-NEXT:    shl v0.4h, v0.4h, #8
; VBITS_GE_256-NEXT:    sshr v1.4h, v1.4h, #8
; VBITS_GE_256-NEXT:    sshr v0.4h, v0.4h, #8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1b { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
; VBITS_GE_256-NEXT:    str d0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr d0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_512-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_512-NEXT:    str d0, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x i8>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i8> %cval, zeroinitializer
  %vals = call <8 x i8> @llvm.masked.gather.v8i8(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i8> undef)
  store <8 x i8> %vals, ptr %a
  ret void
}

define void @masked_gather_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    cmeq v0.16b, v0.16b, #0
; CHECK-NEXT:    sunpklo z0.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1b { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x i8>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i8> %cval, zeroinitializer
  %vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i8> undef)
  store <16 x i8> %vals, ptr %a
  ret void
}

define void @masked_gather_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1b { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x i8>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i8> %cval, zeroinitializer
  %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i8> undef)
  store <32 x i8> %vals, ptr %a
  ret void
}

;
; LD1H
;

define void @masked_gather_v2i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    ldrh w9, [x0, #2]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    mov v0.s[1], w9
; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    st1h { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x i16>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i16> %cval, zeroinitializer
  %vals = call <2 x i16> @llvm.masked.gather.v2i16(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i16> undef)
  store <2 x i16> %vals, ptr %a
  ret void
}

define void @masked_gather_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    cmeq v0.4h, v0.4h, #0
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x i16>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i16> %cval, zeroinitializer
  %vals = call <4 x i16> @llvm.masked.gather.v4i16(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i16> undef)
  store <4 x i16> %vals, ptr %a
  ret void
}

define void @masked_gather_v8i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    str q1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr q0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    str q0, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x i16>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i16> %cval, zeroinitializer
  %vals = call <8 x i16> @llvm.masked.gather.v8i16(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i16> undef)
  store <8 x i16> %vals, ptr %a
  ret void
}

define void @masked_gather_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x i16>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i16> %cval, zeroinitializer
  %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i16> undef)
  store <16 x i16> %vals, ptr %a
  ret void
}

define void @masked_gather_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x i16>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i16> %cval, zeroinitializer
  %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i16> undef)
  store <32 x i16> %vals, ptr %a
  ret void
}

;
; LD1W
;

define void @masked_gather_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    cmeq v0.2s, v0.2s, #0
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x i32>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i32> %cval, zeroinitializer
  %vals = call <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i32> undef)
  store <2 x i32> %vals, ptr %a
  ret void
}

define void @masked_gather_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    cmeq v0.4s, v0.4s, #0
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x i32>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i32> %cval, zeroinitializer
  %vals = call <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i32> undef)
  store <4 x i32> %vals, ptr %a
  ret void
}

define void @masked_gather_v8i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    ld1w { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p2/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x i32>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i32> %cval, zeroinitializer
  %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i32> undef)
  store <8 x i32> %vals, ptr %a
  ret void
}

define void @masked_gather_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x i32>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i32> %cval, zeroinitializer
  %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i32> undef)
  store <16 x i32> %vals, ptr %a
  ret void
}

define void @masked_gather_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x i32>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i32> %cval, zeroinitializer
  %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i32> undef)
  store <32 x i32> %vals, ptr %a
  ret void
}

;
; LD1D
;

; Scalarize 1 x i64 gathers
define void @masked_gather_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    // implicit-def: $d0
; CHECK-NEXT:    cbnz x8, .LBB15_2
; CHECK-NEXT:  // %bb.1: // %cond.load
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    ldr d0, [x8]
; CHECK-NEXT:  .LBB15_2: // %else
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <1 x i64>, ptr %a
  %ptrs = load <1 x ptr>, ptr %b
  %mask = icmp eq <1 x i64> %cval, zeroinitializer
  %vals = call <1 x i64> @llvm.masked.gather.v1i64(<1 x ptr> %ptrs, i32 8, <1 x i1> %mask, <1 x i64> undef)
  store <1 x i64> %vals, ptr %a
  ret void
}

define void @masked_gather_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    cmeq v0.2d, v0.2d, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x i64>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = icmp eq <2 x i64> %cval, zeroinitializer
  %vals = call <2 x i64> @llvm.masked.gather.v2i64(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x i64> undef)
  store <2 x i64> %vals, ptr %a
  ret void
}

define void @masked_gather_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x i64>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = icmp eq <4 x i64> %cval, zeroinitializer
  %vals = call <4 x i64> @llvm.masked.gather.v4i64(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x i64> undef)
  store <4 x i64> %vals, ptr %a
  ret void
}

define void @masked_gather_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x i64>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = icmp eq <8 x i64> %cval, zeroinitializer
  %vals = call <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x i64> undef)
  store <8 x i64> %vals, ptr %a
  ret void
}

define void @masked_gather_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x i64>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = icmp eq <16 x i64> %cval, zeroinitializer
  %vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x i64> undef)
  store <16 x i64> %vals, ptr %a
  ret void
}

define void @masked_gather_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x i64>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = icmp eq <32 x i64> %cval, zeroinitializer
  %vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x i64> undef)
  store <32 x i64> %vals, ptr %a
  ret void
}

;
; LD1H (float)
;

define void @masked_gather_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v1.4h, v1.4h, #0.0
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mov v0.h[0], v1.h[0]
; CHECK-NEXT:    mov w8, v1.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    str s0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x half>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = fcmp oeq <2 x half> %cval, zeroinitializer
  %vals = call <2 x half> @llvm.masked.gather.v2f16(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x half> undef)
  store <2 x half> %vals, ptr %a
  ret void
}

define void @masked_gather_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v0.4h, v0.4h, #0.0
; CHECK-NEXT:    sunpklo z0.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x half>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = fcmp oeq <4 x half> %cval, zeroinitializer
  %vals = call <4 x half> @llvm.masked.gather.v4f16(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x half> undef)
  store <4 x half> %vals, ptr %a
  ret void
}

define void @masked_gather_v8f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    fcmeq v0.8h, v0.8h, #0.0
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1h { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    str q1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ldr q0, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    fcmeq v0.8h, v0.8h, #0.0
; VBITS_GE_512-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_512-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [z0.d]
; VBITS_GE_512-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_512-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_512-NEXT:    str q0, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x half>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = fcmp oeq <8 x half> %cval, zeroinitializer
  %vals = call <8 x half> @llvm.masked.gather.v8f16(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x half> undef)
  store <8 x half> %vals, ptr %a
  ret void
}

define void @masked_gather_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x half>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = fcmp oeq <16 x half> %cval, zeroinitializer
  %vals = call <16 x half> @llvm.masked.gather.v16f16(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x half> undef)
  store <16 x half> %vals, ptr %a
  ret void
}

define void @masked_gather_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1h { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x half>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x half> %cval, zeroinitializer
  %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef)
  store <32 x half> %vals, ptr %a
  ret void
}

;
; LD1W (float)
;

define void @masked_gather_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fcmeq v0.2s, v0.2s, #0.0
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x float>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = fcmp oeq <2 x float> %cval, zeroinitializer
  %vals = call <2 x float> @llvm.masked.gather.v2f32(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x float> undef)
  store <2 x float> %vals, ptr %a
  ret void
}

define void @masked_gather_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    fcmeq v0.4s, v0.4s, #0.0
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x float>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = fcmp oeq <4 x float> %cval, zeroinitializer
  %vals = call <4 x float> @llvm.masked.gather.v4f32(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x float> undef)
  store <4 x float> %vals, ptr %a
  ret void
}

define void @masked_gather_v8f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ptrue p2.d, vl4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    punpklo p1.h, p1.b
; VBITS_GE_256-NEXT:    and p1.b, p1/z, p1.b, p2.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    ld1w { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    cmpne p1.d, p2/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p2/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    ld1w { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    punpklo p0.h, p0.b
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; VBITS_GE_512-NEXT:    st1w { z0.d }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x float>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = fcmp oeq <8 x float> %cval, zeroinitializer
  %vals = call <8 x float> @llvm.masked.gather.v8f32(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x float> undef)
  store <8 x float> %vals, ptr %a
  ret void
}

define void @masked_gather_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x float>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = fcmp oeq <16 x float> %cval, zeroinitializer
  %vals = call <16 x float> @llvm.masked.gather.v16f32(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x float> undef)
  store <16 x float> %vals, ptr %a
  ret void
}

define void @masked_gather_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x1]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    st1w { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x float>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x float> %cval, zeroinitializer
  %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef)
  store <32 x float> %vals, ptr %a
  ret void
}

;
; LD1D (float)
;

; Scalarize 1 x double gathers
define void @masked_gather_v1f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    fcmp d0, #0.0
; CHECK-NEXT:    // implicit-def: $d0
; CHECK-NEXT:    b.ne .LBB31_2
; CHECK-NEXT:  // %bb.1: // %cond.load
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    ldr d0, [x8]
; CHECK-NEXT:  .LBB31_2: // %else
; CHECK-NEXT:    str d0, [x0]
; CHECK-NEXT:    ret
  %cval = load <1 x double>, ptr %a
  %ptrs = load <1 x ptr>, ptr %b
  %mask = fcmp oeq <1 x double> %cval, zeroinitializer
  %vals = call <1 x double> @llvm.masked.gather.v1f64(<1 x ptr> %ptrs, i32 8, <1 x i1> %mask, <1 x double> undef)
  store <1 x double> %vals, ptr %a
  ret void
}

define void @masked_gather_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    fcmeq v0.2d, v0.2d, #0.0
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
; CHECK-NEXT:    str q0, [x0]
; CHECK-NEXT:    ret
  %cval = load <2 x double>, ptr %a
  %ptrs = load <2 x ptr>, ptr %b
  %mask = fcmp oeq <2 x double> %cval, zeroinitializer
  %vals = call <2 x double> @llvm.masked.gather.v2f64(<2 x ptr> %ptrs, i32 8, <2 x i1> %mask, <2 x double> undef)
  store <2 x double> %vals, ptr %a
  ret void
}

define void @masked_gather_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_gather_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <4 x double>, ptr %a
  %ptrs = load <4 x ptr>, ptr %b
  %mask = fcmp oeq <4 x double> %cval, zeroinitializer
  %vals = call <4 x double> @llvm.masked.gather.v4f64(<4 x ptr> %ptrs, i32 8, <4 x i1> %mask, <4 x double> undef)
  store <4 x double> %vals, ptr %a
  ret void
}

define void @masked_gather_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: masked_gather_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z1.d, #0.0
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [z1.d]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_gather_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %cval = load <8 x double>, ptr %a
  %ptrs = load <8 x ptr>, ptr %b
  %mask = fcmp oeq <8 x double> %cval, zeroinitializer
  %vals = call <8 x double> @llvm.masked.gather.v8f64(<8 x ptr> %ptrs, i32 8, <8 x i1> %mask, <8 x double> undef)
  store <8 x double> %vals, ptr %a
  ret void
}

define void @masked_gather_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_gather_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <16 x double>, ptr %a
  %ptrs = load <16 x ptr>, ptr %b
  %mask = fcmp oeq <16 x double> %cval, zeroinitializer
  %vals = call <16 x double> @llvm.masked.gather.v16f64(<16 x ptr> %ptrs, i32 8, <16 x i1> %mask, <16 x double> undef)
  store <16 x double> %vals, ptr %a
  ret void
}

define void @masked_gather_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_gather_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x1]
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [z0.d]
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %cval = load <32 x double>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x double> %cval, zeroinitializer
  %vals = call <32 x double> @llvm.masked.gather.v32f64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x double> undef)
  store <32 x double> %vals, ptr %a
  ret void
}

; The above tests test the types, the below tests check that the addressing
; modes still function

define void @masked_gather_32b_scaled_sext_f16(ptr %a, ptr
%b, ptr %base) vscale_range(8,0) #0 { 980; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: 981; CHECK: // %bb.0: 982; CHECK-NEXT: ptrue p0.h, vl32 983; CHECK-NEXT: ptrue p1.s, vl32 984; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 985; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 986; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] 987; CHECK-NEXT: punpklo p0.h, p0.b 988; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, sxtw #1] 989; CHECK-NEXT: st1h { z0.s }, p1, [x0] 990; CHECK-NEXT: ret 991 %cvals = load <32 x half>, ptr %a 992 %idxs = load <32 x i32>, ptr %b 993 %ext = sext <32 x i32> %idxs to <32 x i64> 994 %ptrs = getelementptr half, ptr %base, <32 x i64> %ext 995 %mask = fcmp oeq <32 x half> %cvals, zeroinitializer 996 %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef) 997 store <32 x half> %vals, ptr %a 998 ret void 999} 1000 1001define void @masked_gather_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 { 1002; CHECK-LABEL: masked_gather_32b_scaled_sext_f32: 1003; CHECK: // %bb.0: 1004; CHECK-NEXT: ptrue p0.s, vl32 1005; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1006; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 1007; CHECK-NEXT: ld1w { z0.s }, p0/z, [x1] 1008; CHECK-NEXT: ld1w { z0.s }, p1/z, [x2, z0.s, sxtw #2] 1009; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1010; CHECK-NEXT: ret 1011 %cvals = load <32 x float>, ptr %a 1012 %idxs = load <32 x i32>, ptr %b 1013 %ext = sext <32 x i32> %idxs to <32 x i64> 1014 %ptrs = getelementptr float, ptr %base, <32 x i64> %ext 1015 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1016 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef) 1017 store <32 x float> %vals, ptr %a 1018 ret void 1019} 1020 1021define void @masked_gather_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 { 1022; CHECK-LABEL: masked_gather_32b_scaled_sext_f64: 1023; CHECK: // %bb.0: 1024; CHECK-NEXT: ptrue p0.d, vl32 
1025; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1026; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 1027; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x1] 1028; CHECK-NEXT: ld1d { z0.d }, p1/z, [x2, z0.d, lsl #3] 1029; CHECK-NEXT: st1d { z0.d }, p0, [x0] 1030; CHECK-NEXT: ret 1031 %cvals = load <32 x double>, ptr %a 1032 %idxs = load <32 x i32>, ptr %b 1033 %ext = sext <32 x i32> %idxs to <32 x i64> 1034 %ptrs = getelementptr double, ptr %base, <32 x i64> %ext 1035 %mask = fcmp oeq <32 x double> %cvals, zeroinitializer 1036 %vals = call <32 x double> @llvm.masked.gather.v32f64(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x double> undef) 1037 store <32 x double> %vals, ptr %a 1038 ret void 1039} 1040 1041define void @masked_gather_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 { 1042; CHECK-LABEL: masked_gather_32b_scaled_zext: 1043; CHECK: // %bb.0: 1044; CHECK-NEXT: ptrue p0.h, vl32 1045; CHECK-NEXT: ptrue p1.s, vl32 1046; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1047; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 1048; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] 1049; CHECK-NEXT: punpklo p0.h, p0.b 1050; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, uxtw #1] 1051; CHECK-NEXT: st1h { z0.s }, p1, [x0] 1052; CHECK-NEXT: ret 1053 %cvals = load <32 x half>, ptr %a 1054 %idxs = load <32 x i32>, ptr %b 1055 %ext = zext <32 x i32> %idxs to <32 x i64> 1056 %ptrs = getelementptr half, ptr %base, <32 x i64> %ext 1057 %mask = fcmp oeq <32 x half> %cvals, zeroinitializer 1058 %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef) 1059 store <32 x half> %vals, ptr %a 1060 ret void 1061} 1062 1063define void @masked_gather_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 { 1064; CHECK-LABEL: masked_gather_32b_unscaled_sext: 1065; CHECK: // %bb.0: 1066; CHECK-NEXT: ptrue p0.h, vl32 1067; CHECK-NEXT: ptrue p1.s, vl32 1068; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1069; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 1070; 
CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] 1071; CHECK-NEXT: punpklo p0.h, p0.b 1072; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, sxtw] 1073; CHECK-NEXT: st1h { z0.s }, p1, [x0] 1074; CHECK-NEXT: ret 1075 %cvals = load <32 x half>, ptr %a 1076 %idxs = load <32 x i32>, ptr %b 1077 %ext = sext <32 x i32> %idxs to <32 x i64> 1078 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext 1079 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> 1080 %mask = fcmp oeq <32 x half> %cvals, zeroinitializer 1081 %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef) 1082 store <32 x half> %vals, ptr %a 1083 ret void 1084} 1085 1086define void @masked_gather_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(8,0) #0 { 1087; CHECK-LABEL: masked_gather_32b_unscaled_zext: 1088; CHECK: // %bb.0: 1089; CHECK-NEXT: ptrue p0.h, vl32 1090; CHECK-NEXT: ptrue p1.s, vl32 1091; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 1092; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 1093; CHECK-NEXT: ld1w { z0.s }, p1/z, [x1] 1094; CHECK-NEXT: punpklo p0.h, p0.b 1095; CHECK-NEXT: ld1h { z0.s }, p0/z, [x2, z0.s, uxtw] 1096; CHECK-NEXT: st1h { z0.s }, p1, [x0] 1097; CHECK-NEXT: ret 1098 %cvals = load <32 x half>, ptr %a 1099 %idxs = load <32 x i32>, ptr %b 1100 %ext = zext <32 x i32> %idxs to <32 x i64> 1101 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext 1102 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> 1103 %mask = fcmp oeq <32 x half> %cvals, zeroinitializer 1104 %vals = call <32 x half> @llvm.masked.gather.v32f16(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x half> undef) 1105 store <32 x half> %vals, ptr %a 1106 ret void 1107} 1108 1109define void @masked_gather_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 { 1110; CHECK-LABEL: masked_gather_64b_scaled: 1111; CHECK: // %bb.0: 1112; CHECK-NEXT: ptrue p0.s, vl32 1113; CHECK-NEXT: ptrue p1.d, vl32 1114; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1115; CHECK-NEXT: fcmeq p0.s, 
p0/z, z0.s, #0.0 1116; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] 1117; CHECK-NEXT: punpklo p0.h, p0.b 1118; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z0.d, lsl #2] 1119; CHECK-NEXT: st1w { z0.d }, p1, [x0] 1120; CHECK-NEXT: ret 1121 %cvals = load <32 x float>, ptr %a 1122 %idxs = load <32 x i64>, ptr %b 1123 %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs 1124 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1125 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef) 1126 store <32 x float> %vals, ptr %a 1127 ret void 1128} 1129 1130define void @masked_gather_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 { 1131; CHECK-LABEL: masked_gather_64b_unscaled: 1132; CHECK: // %bb.0: 1133; CHECK-NEXT: ptrue p0.s, vl32 1134; CHECK-NEXT: ptrue p1.d, vl32 1135; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1136; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 1137; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] 1138; CHECK-NEXT: punpklo p0.h, p0.b 1139; CHECK-NEXT: ld1w { z0.d }, p0/z, [x2, z0.d] 1140; CHECK-NEXT: st1w { z0.d }, p1, [x0] 1141; CHECK-NEXT: ret 1142 %cvals = load <32 x float>, ptr %a 1143 %idxs = load <32 x i64>, ptr %b 1144 %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs 1145 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> 1146 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1147 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef) 1148 store <32 x float> %vals, ptr %a 1149 ret void 1150} 1151 1152define void @masked_gather_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(16,0) #0 { 1153; CHECK-LABEL: masked_gather_vec_plus_reg: 1154; CHECK: // %bb.0: 1155; CHECK-NEXT: ptrue p0.s, vl32 1156; CHECK-NEXT: ptrue p1.d, vl32 1157; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1158; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 1159; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] 1160; CHECK-NEXT: punpklo p0.h, p0.b 1161; CHECK-NEXT: ld1w { z0.d 
}, p0/z, [x2, z0.d] 1162; CHECK-NEXT: st1w { z0.d }, p1, [x0] 1163; CHECK-NEXT: ret 1164 %cvals = load <32 x float>, ptr %a 1165 %bases = load <32 x ptr>, ptr %b 1166 %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off 1167 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> 1168 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1169 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef) 1170 store <32 x float> %vals, ptr %a 1171 ret void 1172} 1173 1174define void @masked_gather_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 { 1175; CHECK-LABEL: masked_gather_vec_plus_imm: 1176; CHECK: // %bb.0: 1177; CHECK-NEXT: ptrue p0.s, vl32 1178; CHECK-NEXT: ptrue p1.d, vl32 1179; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1180; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 1181; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] 1182; CHECK-NEXT: punpklo p0.h, p0.b 1183; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d, #4] 1184; CHECK-NEXT: st1w { z0.d }, p1, [x0] 1185; CHECK-NEXT: ret 1186 %cvals = load <32 x float>, ptr %a 1187 %bases = load <32 x ptr>, ptr %b 1188 %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4 1189 %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr> 1190 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1191 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> undef) 1192 store <32 x float> %vals, ptr %a 1193 ret void 1194} 1195 1196define void @masked_gather_passthru(ptr %a, ptr %b, ptr %c) vscale_range(16,0) #0 { 1197; CHECK-LABEL: masked_gather_passthru: 1198; CHECK: // %bb.0: 1199; CHECK-NEXT: ptrue p0.s, vl32 1200; CHECK-NEXT: ptrue p2.d, vl32 1201; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1202; CHECK-NEXT: ld1w { z1.s }, p0/z, [x2] 1203; CHECK-NEXT: fcmeq p1.s, p0/z, z0.s, #0.0 1204; CHECK-NEXT: ld1d { z0.d }, p2/z, [x1] 1205; CHECK-NEXT: punpklo p2.h, p1.b 1206; CHECK-NEXT: ld1w { z0.d }, p2/z, [z0.d] 1207; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s 
1208; CHECK-NEXT: sel z0.s, p1, z0.s, z1.s 1209; CHECK-NEXT: st1w { z0.s }, p0, [x0] 1210; CHECK-NEXT: ret 1211 %cvals = load <32 x float>, ptr %a 1212 %ptrs = load <32 x ptr>, ptr %b 1213 %passthru = load <32 x float>, ptr %c 1214 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1215 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> %passthru) 1216 store <32 x float> %vals, ptr %a 1217 ret void 1218} 1219 1220define void @masked_gather_passthru_0(ptr %a, ptr %b) vscale_range(16,0) #0 { 1221; CHECK-LABEL: masked_gather_passthru_0: 1222; CHECK: // %bb.0: 1223; CHECK-NEXT: ptrue p0.s, vl32 1224; CHECK-NEXT: ptrue p1.d, vl32 1225; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 1226; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 1227; CHECK-NEXT: ld1d { z0.d }, p1/z, [x1] 1228; CHECK-NEXT: punpklo p0.h, p0.b 1229; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] 1230; CHECK-NEXT: st1w { z0.d }, p1, [x0] 1231; CHECK-NEXT: ret 1232 %cvals = load <32 x float>, ptr %a 1233 %ptrs = load <32 x ptr>, ptr %b 1234 %mask = fcmp oeq <32 x float> %cvals, zeroinitializer 1235 %vals = call <32 x float> @llvm.masked.gather.v32f32(<32 x ptr> %ptrs, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer) 1236 store <32 x float> %vals, ptr %a 1237 ret void 1238} 1239 1240declare <2 x i8> @llvm.masked.gather.v2i8(<2 x ptr>, i32, <2 x i1>, <2 x i8>) 1241declare <4 x i8> @llvm.masked.gather.v4i8(<4 x ptr>, i32, <4 x i1>, <4 x i8>) 1242declare <8 x i8> @llvm.masked.gather.v8i8(<8 x ptr>, i32, <8 x i1>, <8 x i8>) 1243declare <16 x i8> @llvm.masked.gather.v16i8(<16 x ptr>, i32, <16 x i1>, <16 x i8>) 1244declare <32 x i8> @llvm.masked.gather.v32i8(<32 x ptr>, i32, <32 x i1>, <32 x i8>) 1245 1246declare <2 x i16> @llvm.masked.gather.v2i16(<2 x ptr>, i32, <2 x i1>, <2 x i16>) 1247declare <4 x i16> @llvm.masked.gather.v4i16(<4 x ptr>, i32, <4 x i1>, <4 x i16>) 1248declare <8 x i16> @llvm.masked.gather.v8i16(<8 x ptr>, i32, <8 x i1>, <8 x i16>) 1249declare <16 
x i16> @llvm.masked.gather.v16i16(<16 x ptr>, i32, <16 x i1>, <16 x i16>) 1250declare <32 x i16> @llvm.masked.gather.v32i16(<32 x ptr>, i32, <32 x i1>, <32 x i16>) 1251 1252declare <2 x i32> @llvm.masked.gather.v2i32(<2 x ptr>, i32, <2 x i1>, <2 x i32>) 1253declare <4 x i32> @llvm.masked.gather.v4i32(<4 x ptr>, i32, <4 x i1>, <4 x i32>) 1254declare <8 x i32> @llvm.masked.gather.v8i32(<8 x ptr>, i32, <8 x i1>, <8 x i32>) 1255declare <16 x i32> @llvm.masked.gather.v16i32(<16 x ptr>, i32, <16 x i1>, <16 x i32>) 1256declare <32 x i32> @llvm.masked.gather.v32i32(<32 x ptr>, i32, <32 x i1>, <32 x i32>) 1257 1258declare <1 x i64> @llvm.masked.gather.v1i64(<1 x ptr>, i32, <1 x i1>, <1 x i64>) 1259declare <2 x i64> @llvm.masked.gather.v2i64(<2 x ptr>, i32, <2 x i1>, <2 x i64>) 1260declare <4 x i64> @llvm.masked.gather.v4i64(<4 x ptr>, i32, <4 x i1>, <4 x i64>) 1261declare <8 x i64> @llvm.masked.gather.v8i64(<8 x ptr>, i32, <8 x i1>, <8 x i64>) 1262declare <16 x i64> @llvm.masked.gather.v16i64(<16 x ptr>, i32, <16 x i1>, <16 x i64>) 1263declare <32 x i64> @llvm.masked.gather.v32i64(<32 x ptr>, i32, <32 x i1>, <32 x i64>) 1264 1265declare <2 x half> @llvm.masked.gather.v2f16(<2 x ptr>, i32, <2 x i1>, <2 x half>) 1266declare <4 x half> @llvm.masked.gather.v4f16(<4 x ptr>, i32, <4 x i1>, <4 x half>) 1267declare <8 x half> @llvm.masked.gather.v8f16(<8 x ptr>, i32, <8 x i1>, <8 x half>) 1268declare <16 x half> @llvm.masked.gather.v16f16(<16 x ptr>, i32, <16 x i1>, <16 x half>) 1269declare <32 x half> @llvm.masked.gather.v32f16(<32 x ptr>, i32, <32 x i1>, <32 x half>) 1270 1271declare <2 x float> @llvm.masked.gather.v2f32(<2 x ptr>, i32, <2 x i1>, <2 x float>) 1272declare <4 x float> @llvm.masked.gather.v4f32(<4 x ptr>, i32, <4 x i1>, <4 x float>) 1273declare <8 x float> @llvm.masked.gather.v8f32(<8 x ptr>, i32, <8 x i1>, <8 x float>) 1274declare <16 x float> @llvm.masked.gather.v16f32(<16 x ptr>, i32, <16 x i1>, <16 x float>) 1275declare <32 x float> 
@llvm.masked.gather.v32f32(<32 x ptr>, i32, <32 x i1>, <32 x float>) 1276 1277declare <1 x double> @llvm.masked.gather.v1f64(<1 x ptr>, i32, <1 x i1>, <1 x double>) 1278declare <2 x double> @llvm.masked.gather.v2f64(<2 x ptr>, i32, <2 x i1>, <2 x double>) 1279declare <4 x double> @llvm.masked.gather.v4f64(<4 x ptr>, i32, <4 x i1>, <4 x double>) 1280declare <8 x double> @llvm.masked.gather.v8f64(<8 x ptr>, i32, <8 x i1>, <8 x double>) 1281declare <16 x double> @llvm.masked.gather.v16f64(<16 x ptr>, i32, <16 x i1>, <16 x double>) 1282declare <32 x double> @llvm.masked.gather.v32f64(<32 x ptr>, i32, <32 x i1>, <32 x double>) 1283 1284attributes #0 = { "target-features"="+sve" } 1285