1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 3; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 5 6target triple = "aarch64-unknown-linux-gnu" 7 8; 9; ST1B 10; 11 12define void @masked_scatter_v2i8(ptr %a, ptr %b) vscale_range(2,0) #0 { 13; CHECK-LABEL: masked_scatter_v2i8: 14; CHECK: // %bb.0: 15; CHECK-NEXT: ldrb w8, [x0] 16; CHECK-NEXT: ldrb w9, [x0, #1] 17; CHECK-NEXT: ptrue p0.d, vl2 18; CHECK-NEXT: fmov s0, w8 19; CHECK-NEXT: mov v0.s[1], w9 20; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 21; CHECK-NEXT: ushll v0.2d, v0.2s, #0 22; CHECK-NEXT: sshll v1.2d, v1.2s, #0 23; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 24; CHECK-NEXT: ldr q1, [x1] 25; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] 26; CHECK-NEXT: ret 27 %vals = load <2 x i8>, ptr %a 28 %ptrs = load <2 x ptr>, ptr %b 29 %mask = icmp eq <2 x i8> %vals, zeroinitializer 30 call void @llvm.masked.scatter.v2i8(<2 x i8> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 31 ret void 32} 33 34define void @masked_scatter_v4i8(ptr %a, ptr %b) vscale_range(2,0) #0 { 35; CHECK-LABEL: masked_scatter_v4i8: 36; CHECK: // %bb.0: 37; CHECK-NEXT: ldr s0, [x0] 38; CHECK-NEXT: ptrue p0.d, vl4 39; CHECK-NEXT: ushll v0.8h, v0.8b, #0 40; CHECK-NEXT: cmeq v1.4h, v0.4h, #0 41; CHECK-NEXT: uunpklo z0.s, z0.h 42; CHECK-NEXT: sunpklo z1.s, z1.h 43; CHECK-NEXT: uunpklo z0.d, z0.s 44; CHECK-NEXT: sunpklo z1.d, z1.s 45; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 46; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 47; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] 48; CHECK-NEXT: ret 49 %vals = load <4 x i8>, ptr %a 50 %ptrs = load <4 x ptr>, ptr %b 51 %mask = icmp eq <4 x i8> %vals, zeroinitializer 52 call void @llvm.masked.scatter.v4i8(<4 x i8> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 53 ret 
void 54} 55 56define void @masked_scatter_v8i8(ptr %a, ptr %b) #0 { 57; VBITS_GE_256-LABEL: masked_scatter_v8i8: 58; VBITS_GE_256: // %bb.0: 59; VBITS_GE_256-NEXT: ldr d0, [x0] 60; VBITS_GE_256-NEXT: ptrue p0.d, vl4 61; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 62; VBITS_GE_256-NEXT: cmeq v1.8b, v0.8b, #0 63; VBITS_GE_256-NEXT: zip1 v3.8b, v0.8b, v0.8b 64; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] 65; VBITS_GE_256-NEXT: zip1 v2.8b, v1.8b, v0.8b 66; VBITS_GE_256-NEXT: zip2 v1.8b, v1.8b, v0.8b 67; VBITS_GE_256-NEXT: zip2 v0.8b, v0.8b, v0.8b 68; VBITS_GE_256-NEXT: uunpklo z3.s, z3.h 69; VBITS_GE_256-NEXT: shl v2.4h, v2.4h, #8 70; VBITS_GE_256-NEXT: shl v1.4h, v1.4h, #8 71; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h 72; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s 73; VBITS_GE_256-NEXT: sshr v2.4h, v2.4h, #8 74; VBITS_GE_256-NEXT: sshr v1.4h, v1.4h, #8 75; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s 76; VBITS_GE_256-NEXT: sunpklo z2.s, z2.h 77; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h 78; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s 79; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s 80; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 81; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] 82; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 83; VBITS_GE_256-NEXT: st1b { z3.d }, p1, [z2.d] 84; VBITS_GE_256-NEXT: st1b { z0.d }, p0, [z4.d] 85; VBITS_GE_256-NEXT: ret 86; 87; VBITS_GE_512-LABEL: masked_scatter_v8i8: 88; VBITS_GE_512: // %bb.0: 89; VBITS_GE_512-NEXT: ldr d0, [x0] 90; VBITS_GE_512-NEXT: ptrue p0.d, vl8 91; VBITS_GE_512-NEXT: cmeq v1.8b, v0.8b, #0 92; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b 93; VBITS_GE_512-NEXT: sunpklo z1.h, z1.b 94; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h 95; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h 96; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s 97; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s 98; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 99; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 100; VBITS_GE_512-NEXT: st1b { z0.d }, p1, [z1.d] 101; VBITS_GE_512-NEXT: ret 102 %vals = load <8 x i8>, 
ptr %a 103 %ptrs = load <8 x ptr>, ptr %b 104 %mask = icmp eq <8 x i8> %vals, zeroinitializer 105 call void @llvm.masked.scatter.v8i8(<8 x i8> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 106 ret void 107} 108 109define void @masked_scatter_v16i8(ptr %a, ptr %b) vscale_range(8,0) #0 { 110; CHECK-LABEL: masked_scatter_v16i8: 111; CHECK: // %bb.0: 112; CHECK-NEXT: ldr q0, [x0] 113; CHECK-NEXT: ptrue p0.d, vl16 114; CHECK-NEXT: cmeq v1.16b, v0.16b, #0 115; CHECK-NEXT: uunpklo z0.h, z0.b 116; CHECK-NEXT: sunpklo z1.h, z1.b 117; CHECK-NEXT: uunpklo z0.s, z0.h 118; CHECK-NEXT: sunpklo z1.s, z1.h 119; CHECK-NEXT: uunpklo z0.d, z0.s 120; CHECK-NEXT: sunpklo z1.d, z1.s 121; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 122; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 123; CHECK-NEXT: st1b { z0.d }, p1, [z1.d] 124; CHECK-NEXT: ret 125 %vals = load <16 x i8>, ptr %a 126 %ptrs = load <16 x ptr>, ptr %b 127 %mask = icmp eq <16 x i8> %vals, zeroinitializer 128 call void @llvm.masked.scatter.v16i8(<16 x i8> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 129 ret void 130} 131 132define void @masked_scatter_v32i8(ptr %a, ptr %b) vscale_range(16,0) #0 { 133; CHECK-LABEL: masked_scatter_v32i8: 134; CHECK: // %bb.0: 135; CHECK-NEXT: ptrue p0.b, vl32 136; CHECK-NEXT: ptrue p1.d, vl32 137; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] 138; CHECK-NEXT: uunpklo z1.h, z0.b 139; CHECK-NEXT: cmpeq p0.b, p0/z, z0.b, #0 140; CHECK-NEXT: uunpklo z0.s, z1.h 141; CHECK-NEXT: punpklo p0.h, p0.b 142; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 143; CHECK-NEXT: punpklo p0.h, p0.b 144; CHECK-NEXT: uunpklo z0.d, z0.s 145; CHECK-NEXT: punpklo p0.h, p0.b 146; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] 147; CHECK-NEXT: ret 148 %vals = load <32 x i8>, ptr %a 149 %ptrs = load <32 x ptr>, ptr %b 150 %mask = icmp eq <32 x i8> %vals, zeroinitializer 151 call void @llvm.masked.scatter.v32i8(<32 x i8> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 152 ret void 153} 154 155; 156; ST1H 157; 158 159define void @masked_scatter_v2i16(ptr %a, 
ptr %b) vscale_range(2,0) #0 { 160; CHECK-LABEL: masked_scatter_v2i16: 161; CHECK: // %bb.0: 162; CHECK-NEXT: ldrh w8, [x0] 163; CHECK-NEXT: ldrh w9, [x0, #2] 164; CHECK-NEXT: ptrue p0.d, vl2 165; CHECK-NEXT: fmov s0, w8 166; CHECK-NEXT: mov v0.s[1], w9 167; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 168; CHECK-NEXT: ushll v0.2d, v0.2s, #0 169; CHECK-NEXT: sshll v1.2d, v1.2s, #0 170; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 171; CHECK-NEXT: ldr q1, [x1] 172; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 173; CHECK-NEXT: ret 174 %vals = load <2 x i16>, ptr %a 175 %ptrs = load <2 x ptr>, ptr %b 176 %mask = icmp eq <2 x i16> %vals, zeroinitializer 177 call void @llvm.masked.scatter.v2i16(<2 x i16> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 178 ret void 179} 180 181define void @masked_scatter_v4i16(ptr %a, ptr %b) vscale_range(2,0) #0 { 182; CHECK-LABEL: masked_scatter_v4i16: 183; CHECK: // %bb.0: 184; CHECK-NEXT: ldr d0, [x0] 185; CHECK-NEXT: ptrue p0.d, vl4 186; CHECK-NEXT: cmeq v1.4h, v0.4h, #0 187; CHECK-NEXT: uunpklo z0.s, z0.h 188; CHECK-NEXT: sunpklo z1.s, z1.h 189; CHECK-NEXT: uunpklo z0.d, z0.s 190; CHECK-NEXT: sunpklo z1.d, z1.s 191; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 192; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 193; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] 194; CHECK-NEXT: ret 195 %vals = load <4 x i16>, ptr %a 196 %ptrs = load <4 x ptr>, ptr %b 197 %mask = icmp eq <4 x i16> %vals, zeroinitializer 198 call void @llvm.masked.scatter.v4i16(<4 x i16> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 199 ret void 200} 201 202define void @masked_scatter_v8i16(ptr %a, ptr %b) #0 { 203; VBITS_GE_256-LABEL: masked_scatter_v8i16: 204; VBITS_GE_256: // %bb.0: 205; VBITS_GE_256-NEXT: ldr q0, [x0] 206; VBITS_GE_256-NEXT: ptrue p0.d, vl4 207; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 208; VBITS_GE_256-NEXT: cmeq v1.8h, v0.8h, #0 209; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h 210; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 211; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] 212; 
VBITS_GE_256-NEXT: sunpklo z2.s, z1.h 213; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 214; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h 215; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s 216; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h 217; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s 218; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s 219; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s 220; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 221; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] 222; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 223; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] 224; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] 225; VBITS_GE_256-NEXT: ret 226; 227; VBITS_GE_512-LABEL: masked_scatter_v8i16: 228; VBITS_GE_512: // %bb.0: 229; VBITS_GE_512-NEXT: ldr q0, [x0] 230; VBITS_GE_512-NEXT: ptrue p0.d, vl8 231; VBITS_GE_512-NEXT: cmeq v1.8h, v0.8h, #0 232; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h 233; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h 234; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s 235; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s 236; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 237; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 238; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] 239; VBITS_GE_512-NEXT: ret 240 %vals = load <8 x i16>, ptr %a 241 %ptrs = load <8 x ptr>, ptr %b 242 %mask = icmp eq <8 x i16> %vals, zeroinitializer 243 call void @llvm.masked.scatter.v8i16(<8 x i16> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 244 ret void 245} 246 247define void @masked_scatter_v16i16(ptr %a, ptr %b) vscale_range(8,0) #0 { 248; CHECK-LABEL: masked_scatter_v16i16: 249; CHECK: // %bb.0: 250; CHECK-NEXT: ptrue p0.h, vl16 251; CHECK-NEXT: ptrue p1.d, vl16 252; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 253; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 254; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 255; CHECK-NEXT: uunpklo z0.s, z0.h 256; CHECK-NEXT: uunpklo z0.d, z0.s 257; CHECK-NEXT: punpklo p0.h, p0.b 258; CHECK-NEXT: punpklo p0.h, p0.b 259; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 260; CHECK-NEXT: ret 261 %vals = load <16 x 
i16>, ptr %a 262 %ptrs = load <16 x ptr>, ptr %b 263 %mask = icmp eq <16 x i16> %vals, zeroinitializer 264 call void @llvm.masked.scatter.v16i16(<16 x i16> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 265 ret void 266} 267 268define void @masked_scatter_v32i16(ptr %a, ptr %b) vscale_range(16,0) #0 { 269; CHECK-LABEL: masked_scatter_v32i16: 270; CHECK: // %bb.0: 271; CHECK-NEXT: ptrue p0.h, vl32 272; CHECK-NEXT: ptrue p1.d, vl32 273; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 274; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 275; CHECK-NEXT: cmpeq p0.h, p0/z, z0.h, #0 276; CHECK-NEXT: uunpklo z0.s, z0.h 277; CHECK-NEXT: uunpklo z0.d, z0.s 278; CHECK-NEXT: punpklo p0.h, p0.b 279; CHECK-NEXT: punpklo p0.h, p0.b 280; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 281; CHECK-NEXT: ret 282 %vals = load <32 x i16>, ptr %a 283 %ptrs = load <32 x ptr>, ptr %b 284 %mask = icmp eq <32 x i16> %vals, zeroinitializer 285 call void @llvm.masked.scatter.v32i16(<32 x i16> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 286 ret void 287} 288 289; 290; ST1W 291; 292 293define void @masked_scatter_v2i32(ptr %a, ptr %b) vscale_range(2,0) #0 { 294; CHECK-LABEL: masked_scatter_v2i32: 295; CHECK: // %bb.0: 296; CHECK-NEXT: ldr d0, [x0] 297; CHECK-NEXT: ptrue p0.d, vl2 298; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 299; CHECK-NEXT: ushll v0.2d, v0.2s, #0 300; CHECK-NEXT: sshll v1.2d, v1.2s, #0 301; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 302; CHECK-NEXT: ldr q1, [x1] 303; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 304; CHECK-NEXT: ret 305 %vals = load <2 x i32>, ptr %a 306 %ptrs = load <2 x ptr>, ptr %b 307 %mask = icmp eq <2 x i32> %vals, zeroinitializer 308 call void @llvm.masked.scatter.v2i32(<2 x i32> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 309 ret void 310} 311 312define void @masked_scatter_v4i32(ptr %a, ptr %b) vscale_range(2,0) #0 { 313; CHECK-LABEL: masked_scatter_v4i32: 314; CHECK: // %bb.0: 315; CHECK-NEXT: ldr q0, [x0] 316; CHECK-NEXT: ptrue p0.d, vl4 317; CHECK-NEXT: cmeq v1.4s, v0.4s, #0 318; 
CHECK-NEXT: uunpklo z0.d, z0.s 319; CHECK-NEXT: sunpklo z1.d, z1.s 320; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 321; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 322; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] 323; CHECK-NEXT: ret 324 %vals = load <4 x i32>, ptr %a 325 %ptrs = load <4 x ptr>, ptr %b 326 %mask = icmp eq <4 x i32> %vals, zeroinitializer 327 call void @llvm.masked.scatter.v4i32(<4 x i32> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 328 ret void 329} 330 331define void @masked_scatter_v8i32(ptr %a, ptr %b) #0 { 332; VBITS_GE_256-LABEL: masked_scatter_v8i32: 333; VBITS_GE_256: // %bb.0: 334; VBITS_GE_256-NEXT: ptrue p0.s, vl8 335; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 336; VBITS_GE_256-NEXT: ptrue p1.d, vl4 337; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] 338; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] 339; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] 340; VBITS_GE_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 341; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s 342; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 343; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s 344; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff 345; VBITS_GE_256-NEXT: punpklo p0.h, p0.b 346; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b 347; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 348; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] 349; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s 350; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 351; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] 352; VBITS_GE_256-NEXT: ret 353; 354; VBITS_GE_512-LABEL: masked_scatter_v8i32: 355; VBITS_GE_512: // %bb.0: 356; VBITS_GE_512-NEXT: ptrue p0.s, vl8 357; VBITS_GE_512-NEXT: ptrue p1.d, vl8 358; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 359; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] 360; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 361; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s 362; VBITS_GE_512-NEXT: punpklo p0.h, p0.b 363; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] 364; VBITS_GE_512-NEXT: ret 365 %vals = load <8 x 
i32>, ptr %a 366 %ptrs = load <8 x ptr>, ptr %b 367 %mask = icmp eq <8 x i32> %vals, zeroinitializer 368 call void @llvm.masked.scatter.v8i32(<8 x i32> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 369 ret void 370} 371 372define void @masked_scatter_v16i32(ptr %a, ptr %b) vscale_range(8,0) #0 { 373; CHECK-LABEL: masked_scatter_v16i32: 374; CHECK: // %bb.0: 375; CHECK-NEXT: ptrue p0.s, vl16 376; CHECK-NEXT: ptrue p1.d, vl16 377; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 378; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 379; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 380; CHECK-NEXT: uunpklo z0.d, z0.s 381; CHECK-NEXT: punpklo p0.h, p0.b 382; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 383; CHECK-NEXT: ret 384 %vals = load <16 x i32>, ptr %a 385 %ptrs = load <16 x ptr>, ptr %b 386 %mask = icmp eq <16 x i32> %vals, zeroinitializer 387 call void @llvm.masked.scatter.v16i32(<16 x i32> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 388 ret void 389} 390 391define void @masked_scatter_v32i32(ptr %a, ptr %b) vscale_range(16,0) #0 { 392; CHECK-LABEL: masked_scatter_v32i32: 393; CHECK: // %bb.0: 394; CHECK-NEXT: ptrue p0.s, vl32 395; CHECK-NEXT: ptrue p1.d, vl32 396; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 397; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 398; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, #0 399; CHECK-NEXT: uunpklo z0.d, z0.s 400; CHECK-NEXT: punpklo p0.h, p0.b 401; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 402; CHECK-NEXT: ret 403 %vals = load <32 x i32>, ptr %a 404 %ptrs = load <32 x ptr>, ptr %b 405 %mask = icmp eq <32 x i32> %vals, zeroinitializer 406 call void @llvm.masked.scatter.v32i32(<32 x i32> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 407 ret void 408} 409 410; 411; ST1D 412; 413 414; Scalarize 1 x i64 scatters 415define void @masked_scatter_v1i64(ptr %a, ptr %b) vscale_range(2,0) #0 { 416; CHECK-LABEL: masked_scatter_v1i64: 417; CHECK: // %bb.0: 418; CHECK-NEXT: ldr d0, [x0] 419; CHECK-NEXT: fmov x8, d0 420; CHECK-NEXT: cbnz x8, .LBB15_2 421; CHECK-NEXT: // %bb.1: // %cond.store 
422; CHECK-NEXT: ldr d1, [x1] 423; CHECK-NEXT: fmov x8, d1 424; CHECK-NEXT: str d0, [x8] 425; CHECK-NEXT: .LBB15_2: // %else 426; CHECK-NEXT: ret 427 %vals = load <1 x i64>, ptr %a 428 %ptrs = load <1 x ptr>, ptr %b 429 %mask = icmp eq <1 x i64> %vals, zeroinitializer 430 call void @llvm.masked.scatter.v1i64(<1 x i64> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask) 431 ret void 432} 433 434define void @masked_scatter_v2i64(ptr %a, ptr %b) vscale_range(2,0) #0 { 435; CHECK-LABEL: masked_scatter_v2i64: 436; CHECK: // %bb.0: 437; CHECK-NEXT: ldr q0, [x0] 438; CHECK-NEXT: ptrue p0.d, vl2 439; CHECK-NEXT: cmeq v1.2d, v0.2d, #0 440; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 441; CHECK-NEXT: ldr q1, [x1] 442; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] 443; CHECK-NEXT: ret 444 %vals = load <2 x i64>, ptr %a 445 %ptrs = load <2 x ptr>, ptr %b 446 %mask = icmp eq <2 x i64> %vals, zeroinitializer 447 call void @llvm.masked.scatter.v2i64(<2 x i64> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 448 ret void 449} 450 451define void @masked_scatter_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 { 452; CHECK-LABEL: masked_scatter_v4i64: 453; CHECK: // %bb.0: 454; CHECK-NEXT: ptrue p0.d, vl4 455; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 456; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 457; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 458; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 459; CHECK-NEXT: ret 460 %vals = load <4 x i64>, ptr %a 461 %ptrs = load <4 x ptr>, ptr %b 462 %mask = icmp eq <4 x i64> %vals, zeroinitializer 463 call void @llvm.masked.scatter.v4i64(<4 x i64> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 464 ret void 465} 466 467define void @masked_scatter_v8i64(ptr %a, ptr %b) #0 { 468; VBITS_GE_256-LABEL: masked_scatter_v8i64: 469; VBITS_GE_256: // %bb.0: 470; VBITS_GE_256-NEXT: ptrue p0.d, vl4 471; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 472; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] 473; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] 474; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] 
475; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] 476; VBITS_GE_256-NEXT: cmpeq p1.d, p0/z, z0.d, #0 477; VBITS_GE_256-NEXT: cmpeq p0.d, p0/z, z2.d, #0 478; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z1.d] 479; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [z3.d] 480; VBITS_GE_256-NEXT: ret 481; 482; VBITS_GE_512-LABEL: masked_scatter_v8i64: 483; VBITS_GE_512: // %bb.0: 484; VBITS_GE_512-NEXT: ptrue p0.d, vl8 485; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 486; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 487; VBITS_GE_512-NEXT: cmpeq p1.d, p0/z, z0.d, #0 488; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] 489; VBITS_GE_512-NEXT: ret 490 %vals = load <8 x i64>, ptr %a 491 %ptrs = load <8 x ptr>, ptr %b 492 %mask = icmp eq <8 x i64> %vals, zeroinitializer 493 call void @llvm.masked.scatter.v8i64(<8 x i64> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 494 ret void 495} 496 497define void @masked_scatter_v16i64(ptr %a, ptr %b) vscale_range(8,0) #0 { 498; CHECK-LABEL: masked_scatter_v16i64: 499; CHECK: // %bb.0: 500; CHECK-NEXT: ptrue p0.d, vl16 501; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 502; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 503; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 504; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 505; CHECK-NEXT: ret 506 %vals = load <16 x i64>, ptr %a 507 %ptrs = load <16 x ptr>, ptr %b 508 %mask = icmp eq <16 x i64> %vals, zeroinitializer 509 call void @llvm.masked.scatter.v16i64(<16 x i64> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 510 ret void 511} 512 513define void @masked_scatter_v32i64(ptr %a, ptr %b) vscale_range(16,0) #0 { 514; CHECK-LABEL: masked_scatter_v32i64: 515; CHECK: // %bb.0: 516; CHECK-NEXT: ptrue p0.d, vl32 517; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 518; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 519; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 520; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 521; CHECK-NEXT: ret 522 %vals = load <32 x i64>, ptr %a 523 %ptrs = load <32 x ptr>, ptr %b 524 %mask = icmp eq <32 x i64> %vals, zeroinitializer 
525 call void @llvm.masked.scatter.v32i64(<32 x i64> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 526 ret void 527} 528 529; 530; ST1H (float) 531; 532 533define void @masked_scatter_v2f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 534; CHECK-LABEL: masked_scatter_v2f16: 535; CHECK: // %bb.0: 536; CHECK-NEXT: ldr s1, [x0] 537; CHECK-NEXT: movi v0.2d, #0000000000000000 538; CHECK-NEXT: ptrue p0.d, vl4 539; CHECK-NEXT: fcmeq v2.4h, v1.4h, #0.0 540; CHECK-NEXT: uunpklo z1.s, z1.h 541; CHECK-NEXT: sshll v2.4s, v2.4h, #0 542; CHECK-NEXT: mov v0.h[0], v2.h[0] 543; CHECK-NEXT: mov w8, v2.s[1] 544; CHECK-NEXT: mov v0.h[1], w8 545; CHECK-NEXT: sunpklo z0.s, z0.h 546; CHECK-NEXT: sunpklo z0.d, z0.s 547; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 548; CHECK-NEXT: uunpklo z0.d, z1.s 549; CHECK-NEXT: ldr q1, [x1] 550; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 551; CHECK-NEXT: ret 552 %vals = load <2 x half>, ptr %a 553 %ptrs = load <2 x ptr>, ptr %b 554 %mask = fcmp oeq <2 x half> %vals, zeroinitializer 555 call void @llvm.masked.scatter.v2f16(<2 x half> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 556 ret void 557} 558 559define void @masked_scatter_v4f16(ptr %a, ptr %b) vscale_range(2,0) #0 { 560; CHECK-LABEL: masked_scatter_v4f16: 561; CHECK: // %bb.0: 562; CHECK-NEXT: ldr d0, [x0] 563; CHECK-NEXT: ptrue p0.d, vl4 564; CHECK-NEXT: fcmeq v1.4h, v0.4h, #0.0 565; CHECK-NEXT: uunpklo z0.s, z0.h 566; CHECK-NEXT: sunpklo z1.s, z1.h 567; CHECK-NEXT: uunpklo z0.d, z0.s 568; CHECK-NEXT: sunpklo z1.d, z1.s 569; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 570; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 571; CHECK-NEXT: st1h { z0.d }, p1, [z1.d] 572; CHECK-NEXT: ret 573 %vals = load <4 x half>, ptr %a 574 %ptrs = load <4 x ptr>, ptr %b 575 %mask = fcmp oeq <4 x half> %vals, zeroinitializer 576 call void @llvm.masked.scatter.v4f16(<4 x half> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 577 ret void 578} 579 580define void @masked_scatter_v8f16(ptr %a, ptr %b) #0 { 581; VBITS_GE_256-LABEL: 
masked_scatter_v8f16: 582; VBITS_GE_256: // %bb.0: 583; VBITS_GE_256-NEXT: ldr q0, [x0] 584; VBITS_GE_256-NEXT: ptrue p0.d, vl4 585; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 586; VBITS_GE_256-NEXT: fcmeq v1.8h, v0.8h, #0.0 587; VBITS_GE_256-NEXT: uunpklo z3.s, z0.h 588; VBITS_GE_256-NEXT: ext v0.16b, v0.16b, v0.16b, #8 589; VBITS_GE_256-NEXT: ld1d { z4.d }, p0/z, [x1, x8, lsl #3] 590; VBITS_GE_256-NEXT: sunpklo z2.s, z1.h 591; VBITS_GE_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 592; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h 593; VBITS_GE_256-NEXT: uunpklo z3.d, z3.s 594; VBITS_GE_256-NEXT: sunpklo z1.s, z1.h 595; VBITS_GE_256-NEXT: sunpklo z2.d, z2.s 596; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s 597; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s 598; VBITS_GE_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 599; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1] 600; VBITS_GE_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 601; VBITS_GE_256-NEXT: st1h { z3.d }, p1, [z2.d] 602; VBITS_GE_256-NEXT: st1h { z0.d }, p0, [z4.d] 603; VBITS_GE_256-NEXT: ret 604; 605; VBITS_GE_512-LABEL: masked_scatter_v8f16: 606; VBITS_GE_512: // %bb.0: 607; VBITS_GE_512-NEXT: ldr q0, [x0] 608; VBITS_GE_512-NEXT: ptrue p0.d, vl8 609; VBITS_GE_512-NEXT: fcmeq v1.8h, v0.8h, #0.0 610; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h 611; VBITS_GE_512-NEXT: sunpklo z1.s, z1.h 612; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s 613; VBITS_GE_512-NEXT: sunpklo z1.d, z1.s 614; VBITS_GE_512-NEXT: cmpne p1.d, p0/z, z1.d, #0 615; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 616; VBITS_GE_512-NEXT: st1h { z0.d }, p1, [z1.d] 617; VBITS_GE_512-NEXT: ret 618 %vals = load <8 x half>, ptr %a 619 %ptrs = load <8 x ptr>, ptr %b 620 %mask = fcmp oeq <8 x half> %vals, zeroinitializer 621 call void @llvm.masked.scatter.v8f16(<8 x half> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 622 ret void 623} 624 625define void @masked_scatter_v16f16(ptr %a, ptr %b) vscale_range(8,0) #0 { 626; CHECK-LABEL: masked_scatter_v16f16: 627; CHECK: // %bb.0: 628; CHECK-NEXT: ptrue p0.h, vl16 629; 
CHECK-NEXT: ptrue p1.d, vl16 630; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 631; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 632; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 633; CHECK-NEXT: uunpklo z0.s, z0.h 634; CHECK-NEXT: uunpklo z0.d, z0.s 635; CHECK-NEXT: punpklo p0.h, p0.b 636; CHECK-NEXT: punpklo p0.h, p0.b 637; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 638; CHECK-NEXT: ret 639 %vals = load <16 x half>, ptr %a 640 %ptrs = load <16 x ptr>, ptr %b 641 %mask = fcmp oeq <16 x half> %vals, zeroinitializer 642 call void @llvm.masked.scatter.v16f16(<16 x half> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 643 ret void 644} 645 646define void @masked_scatter_v32f16(ptr %a, ptr %b) vscale_range(16,0) #0 { 647; CHECK-LABEL: masked_scatter_v32f16: 648; CHECK: // %bb.0: 649; CHECK-NEXT: ptrue p0.h, vl32 650; CHECK-NEXT: ptrue p1.d, vl32 651; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] 652; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 653; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0 654; CHECK-NEXT: uunpklo z0.s, z0.h 655; CHECK-NEXT: uunpklo z0.d, z0.s 656; CHECK-NEXT: punpklo p0.h, p0.b 657; CHECK-NEXT: punpklo p0.h, p0.b 658; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] 659; CHECK-NEXT: ret 660 %vals = load <32 x half>, ptr %a 661 %ptrs = load <32 x ptr>, ptr %b 662 %mask = fcmp oeq <32 x half> %vals, zeroinitializer 663 call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 664 ret void 665} 666 667; 668; ST1W (float) 669; 670 671define void @masked_scatter_v2f32(ptr %a, ptr %b) vscale_range(2,0) #0 { 672; CHECK-LABEL: masked_scatter_v2f32: 673; CHECK: // %bb.0: 674; CHECK-NEXT: ldr d0, [x0] 675; CHECK-NEXT: ptrue p0.d, vl2 676; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 677; CHECK-NEXT: ushll v0.2d, v0.2s, #0 678; CHECK-NEXT: sshll v1.2d, v1.2s, #0 679; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 680; CHECK-NEXT: ldr q1, [x1] 681; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 682; CHECK-NEXT: ret 683 %vals = load <2 x float>, ptr %a 684 %ptrs = load <2 x ptr>, ptr %b 685 
%mask = fcmp oeq <2 x float> %vals, zeroinitializer 686 call void @llvm.masked.scatter.v2f32(<2 x float> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 687 ret void 688} 689 690define void @masked_scatter_v4f32(ptr %a, ptr %b) vscale_range(2,0) #0 { 691; CHECK-LABEL: masked_scatter_v4f32: 692; CHECK: // %bb.0: 693; CHECK-NEXT: ldr q0, [x0] 694; CHECK-NEXT: ptrue p0.d, vl4 695; CHECK-NEXT: fcmeq v1.4s, v0.4s, #0.0 696; CHECK-NEXT: uunpklo z0.d, z0.s 697; CHECK-NEXT: sunpklo z1.d, z1.s 698; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0 699; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 700; CHECK-NEXT: st1w { z0.d }, p1, [z1.d] 701; CHECK-NEXT: ret 702 %vals = load <4 x float>, ptr %a 703 %ptrs = load <4 x ptr>, ptr %b 704 %mask = fcmp oeq <4 x float> %vals, zeroinitializer 705 call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 706 ret void 707} 708 709define void @masked_scatter_v8f32(ptr %a, ptr %b) #0 { 710; VBITS_GE_256-LABEL: masked_scatter_v8f32: 711; VBITS_GE_256: // %bb.0: 712; VBITS_GE_256-NEXT: ptrue p0.s, vl8 713; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 714; VBITS_GE_256-NEXT: ptrue p1.d, vl4 715; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] 716; VBITS_GE_256-NEXT: ld1d { z3.d }, p1/z, [x1] 717; VBITS_GE_256-NEXT: ld1d { z4.d }, p1/z, [x1, x8, lsl #3] 718; VBITS_GE_256-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 719; VBITS_GE_256-NEXT: uunpklo z2.d, z0.s 720; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 721; VBITS_GE_256-NEXT: uunpklo z0.d, z0.s 722; VBITS_GE_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff 723; VBITS_GE_256-NEXT: punpklo p0.h, p0.b 724; VBITS_GE_256-NEXT: and p0.b, p0/z, p0.b, p1.b 725; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16 726; VBITS_GE_256-NEXT: st1w { z2.d }, p0, [z3.d] 727; VBITS_GE_256-NEXT: sunpklo z1.d, z1.s 728; VBITS_GE_256-NEXT: cmpne p0.d, p1/z, z1.d, #0 729; VBITS_GE_256-NEXT: st1w { z0.d }, p0, [z4.d] 730; VBITS_GE_256-NEXT: ret 731; 732; VBITS_GE_512-LABEL: masked_scatter_v8f32: 733; 
VBITS_GE_512: // %bb.0: 734; VBITS_GE_512-NEXT: ptrue p0.s, vl8 735; VBITS_GE_512-NEXT: ptrue p1.d, vl8 736; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] 737; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] 738; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 739; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s 740; VBITS_GE_512-NEXT: punpklo p0.h, p0.b 741; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] 742; VBITS_GE_512-NEXT: ret 743 %vals = load <8 x float>, ptr %a 744 %ptrs = load <8 x ptr>, ptr %b 745 %mask = fcmp oeq <8 x float> %vals, zeroinitializer 746 call void @llvm.masked.scatter.v8f32(<8 x float> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 747 ret void 748} 749 750define void @masked_scatter_v16f32(ptr %a, ptr %b) vscale_range(8,0) #0 { 751; CHECK-LABEL: masked_scatter_v16f32: 752; CHECK: // %bb.0: 753; CHECK-NEXT: ptrue p0.s, vl16 754; CHECK-NEXT: ptrue p1.d, vl16 755; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 756; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 757; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 758; CHECK-NEXT: uunpklo z0.d, z0.s 759; CHECK-NEXT: punpklo p0.h, p0.b 760; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 761; CHECK-NEXT: ret 762 %vals = load <16 x float>, ptr %a 763 %ptrs = load <16 x ptr>, ptr %b 764 %mask = fcmp oeq <16 x float> %vals, zeroinitializer 765 call void @llvm.masked.scatter.v16f32(<16 x float> %vals, <16 x ptr> %ptrs, i32 8, <16 x i1> %mask) 766 ret void 767} 768 769define void @masked_scatter_v32f32(ptr %a, ptr %b) vscale_range(16,0) #0 { 770; CHECK-LABEL: masked_scatter_v32f32: 771; CHECK: // %bb.0: 772; CHECK-NEXT: ptrue p0.s, vl32 773; CHECK-NEXT: ptrue p1.d, vl32 774; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] 775; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1] 776; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 777; CHECK-NEXT: uunpklo z0.d, z0.s 778; CHECK-NEXT: punpklo p0.h, p0.b 779; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] 780; CHECK-NEXT: ret 781 %vals = load <32 x float>, ptr %a 782 %ptrs = load <32 x ptr>, ptr %b 783 %mask = fcmp oeq <32 x float> %vals, 
zeroinitializer 784 call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask) 785 ret void 786} 787 788; 789; ST1D (float) 790; 791 792; Scalarize 1 x double scatters 793define void @masked_scatter_v1f64(ptr %a, ptr %b) vscale_range(8,0) #0 { 794; CHECK-LABEL: masked_scatter_v1f64: 795; CHECK: // %bb.0: 796; CHECK-NEXT: ldr d0, [x0] 797; CHECK-NEXT: fcmp d0, #0.0 798; CHECK-NEXT: b.ne .LBB31_2 799; CHECK-NEXT: // %bb.1: // %cond.store 800; CHECK-NEXT: ldr d1, [x1] 801; CHECK-NEXT: fmov x8, d1 802; CHECK-NEXT: str d0, [x8] 803; CHECK-NEXT: .LBB31_2: // %else 804; CHECK-NEXT: ret 805 %vals = load <1 x double>, ptr %a 806 %ptrs = load <1 x ptr>, ptr %b 807 %mask = fcmp oeq <1 x double> %vals, zeroinitializer 808 call void @llvm.masked.scatter.v1f64(<1 x double> %vals, <1 x ptr> %ptrs, i32 8, <1 x i1> %mask) 809 ret void 810} 811 812define void @masked_scatter_v2f64(ptr %a, ptr %b) vscale_range(2,0) #0 { 813; CHECK-LABEL: masked_scatter_v2f64: 814; CHECK: // %bb.0: 815; CHECK-NEXT: ldr q0, [x0] 816; CHECK-NEXT: ptrue p0.d, vl2 817; CHECK-NEXT: fcmeq v1.2d, v0.2d, #0.0 818; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 819; CHECK-NEXT: ldr q1, [x1] 820; CHECK-NEXT: st1d { z0.d }, p0, [z1.d] 821; CHECK-NEXT: ret 822 %vals = load <2 x double>, ptr %a 823 %ptrs = load <2 x ptr>, ptr %b 824 %mask = fcmp oeq <2 x double> %vals, zeroinitializer 825 call void @llvm.masked.scatter.v2f64(<2 x double> %vals, <2 x ptr> %ptrs, i32 8, <2 x i1> %mask) 826 ret void 827} 828 829define void @masked_scatter_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 { 830; CHECK-LABEL: masked_scatter_v4f64: 831; CHECK: // %bb.0: 832; CHECK-NEXT: ptrue p0.d, vl4 833; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 834; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 835; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 836; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 837; CHECK-NEXT: ret 838 %vals = load <4 x double>, ptr %a 839 %ptrs = load <4 x ptr>, ptr %b 840 %mask = fcmp oeq <4 x double> %vals, 
zeroinitializer 841 call void @llvm.masked.scatter.v4f64(<4 x double> %vals, <4 x ptr> %ptrs, i32 8, <4 x i1> %mask) 842 ret void 843} 844 845define void @masked_scatter_v8f64(ptr %a, ptr %b) #0 { 846; VBITS_GE_256-LABEL: masked_scatter_v8f64: 847; VBITS_GE_256: // %bb.0: 848; VBITS_GE_256-NEXT: ptrue p0.d, vl4 849; VBITS_GE_256-NEXT: mov x8, #4 // =0x4 850; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0] 851; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x0, x8, lsl #3] 852; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x1] 853; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1, x8, lsl #3] 854; VBITS_GE_256-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 855; VBITS_GE_256-NEXT: fcmeq p0.d, p0/z, z2.d, #0.0 856; VBITS_GE_256-NEXT: st1d { z0.d }, p1, [z1.d] 857; VBITS_GE_256-NEXT: st1d { z2.d }, p0, [z3.d] 858; VBITS_GE_256-NEXT: ret 859; 860; VBITS_GE_512-LABEL: masked_scatter_v8f64: 861; VBITS_GE_512: // %bb.0: 862; VBITS_GE_512-NEXT: ptrue p0.d, vl8 863; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0] 864; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] 865; VBITS_GE_512-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 866; VBITS_GE_512-NEXT: st1d { z0.d }, p1, [z1.d] 867; VBITS_GE_512-NEXT: ret 868 %vals = load <8 x double>, ptr %a 869 %ptrs = load <8 x ptr>, ptr %b 870 %mask = fcmp oeq <8 x double> %vals, zeroinitializer 871 call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 872 ret void 873} 874 875define void @masked_scatter_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 { 876; CHECK-LABEL: masked_scatter_v16f64: 877; CHECK: // %bb.0: 878; CHECK-NEXT: ptrue p0.d, vl16 879; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 880; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 881; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 882; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 883; CHECK-NEXT: ret 884 %vals = load <16 x double>, ptr %a 885 %ptrs = load <16 x ptr>, ptr %b 886 %mask = fcmp oeq <16 x double> %vals, zeroinitializer 887 call void @llvm.masked.scatter.v16f64(<16 x double> %vals, <16 x ptr> 
%ptrs, i32 8, <16 x i1> %mask)
  ret void
}

define void @masked_scatter_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT:    st1d { z0.d }, p1, [z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x double>, ptr %a
  %ptrs = load <32 x ptr>, ptr %b
  %mask = fcmp oeq <32 x double> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

; The above tests test the types, the below tests check that the addressing
; modes still function

define void @masked_scatter_32b_scaled_sext_f16(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw #1]
; CHECK-NEXT:    ret
  %vals = load <32 x half>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = sext <32 x i32> %idxs to <32 x i64>
  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_32b_scaled_sext_f32(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    st1w { z0.s }, p1, [x2, z1.s, sxtw #2]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = sext <32 x i32> %idxs to <32 x i64>
  %ptrs = getelementptr float, ptr %base, <32 x i64> %ext
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

; For 64-bit elements the 32-bit indices are sign-extended as they are
; loaded (ld1sw) and then used with the 64-bit scaled form [x2, z1.d, lsl #3].
define void @masked_scatter_32b_scaled_sext_f64(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, #0.0
; CHECK-NEXT:    st1d { z0.d }, p1, [x2, z1.d, lsl #3]
; CHECK-NEXT:    ret
  %vals = load <32 x double>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = sext <32 x i32> %idxs to <32 x i64>
  %ptrs = getelementptr double, ptr %base, <32 x i64> %ext
  %mask = fcmp oeq <32 x double> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_32b_scaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw #1]
; CHECK-NEXT:    ret
  %vals = load <32 x half>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = zext <32 x i32> %idxs to <32 x i64>
  %ptrs = getelementptr half, ptr %base, <32 x i64> %ext
  %mask = fcmp oeq <32 x half> %vals,
zeroinitializer
  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

; Unscaled (byte) offsets: the extend form without a shift (sxtw/uxtw) is
; expected instead of the scaled sxtw #1 variant.
define void @masked_scatter_32b_unscaled_sext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, sxtw]
; CHECK-NEXT:    ret
  %vals = load <32 x half>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = sext <32 x i32> %idxs to <32 x i64>
  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_32b_unscaled_zext(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT:    uunpklo z0.s, z0.h
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1h { z0.s }, p0, [x2, z1.s, uxtw]
; CHECK-NEXT:    ret
  %vals = load <32 x half>, ptr %a
  %idxs = load <32 x i32>, ptr %b
  %ext = zext <32 x i32> %idxs to <32 x i64>
  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %ext
  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
  %mask = fcmp oeq <32 x half> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x ptr>
%ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_64b_scaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_64b_scaled:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d, lsl #2]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %idxs = load <32 x i64>, ptr %b
  %ptrs = getelementptr float, ptr %base, <32 x i64> %idxs
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_64b_unscaled(ptr %a, ptr %b, ptr %base) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_64b_unscaled:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %idxs = load <32 x i64>, ptr %b
  %byte_ptrs = getelementptr i8, ptr %base, <32 x i64> %idxs
  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

define void @masked_scatter_vec_plus_reg(ptr %a, ptr %b, i64 %off) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_vec_plus_reg:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [x2, z1.d]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %bases = load <32 x ptr>, ptr %b
  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 %off
  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

; The constant +4 byte offset folds into the vector-plus-immediate
; addressing form [z1.d, #4].
define void @masked_scatter_vec_plus_imm(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_scatter_vec_plus_imm:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT:    uunpklo z0.d, z0.s
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d, #4]
; CHECK-NEXT:    ret
  %vals = load <32 x float>, ptr %a
  %bases = load <32 x ptr>, ptr %b
  %byte_ptrs = getelementptr i8, <32 x ptr> %bases, i64 4
  %ptrs = bitcast <32 x ptr> %byte_ptrs to <32 x ptr>
  %mask = fcmp oeq <32 x float> %vals, zeroinitializer
  call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x ptr> %ptrs, i32 8, <32 x i1> %mask)
  ret void
}

; extract_subvec(...(insert_subvec(a,b,c))) -> extract_subvec(bitcast(b),d) like
; combines can effectively unlegalise bitcast operations. This test ensures such
; combines do not happen after operation legalisation. When not prevented the
; test triggers infinite combine->legalise->combine->...
1119; 1120; NOTE: For this test to function correctly it's critical for %vals to be in a 1121; different block to the scatter store. If not, the problematic bitcast will be 1122; removed before operation legalisation and thus not exercise the combine. 1123define void @masked_scatter_bitcast_infinite_loop(ptr %a, ptr %b, i1 %cond) vscale_range(4,0) #0 { 1124; CHECK-LABEL: masked_scatter_bitcast_infinite_loop: 1125; CHECK: // %bb.0: 1126; CHECK-NEXT: ptrue p0.d, vl8 1127; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] 1128; CHECK-NEXT: tbz w2, #0, .LBB47_2 1129; CHECK-NEXT: // %bb.1: // %bb.1 1130; CHECK-NEXT: fcmeq p1.d, p0/z, z0.d, #0.0 1131; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] 1132; CHECK-NEXT: st1d { z0.d }, p1, [z1.d] 1133; CHECK-NEXT: .LBB47_2: // %bb.2 1134; CHECK-NEXT: ret 1135 %vals = load volatile <8 x double>, ptr %a 1136 br i1 %cond, label %bb.1, label %bb.2 1137 1138bb.1: 1139 %ptrs = load <8 x ptr>, ptr %b 1140 %mask = fcmp oeq <8 x double> %vals, zeroinitializer 1141 call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x ptr> %ptrs, i32 8, <8 x i1> %mask) 1142 br label %bb.2 1143 1144bb.2: 1145 ret void 1146} 1147 1148declare void @llvm.masked.scatter.v2i8(<2 x i8>, <2 x ptr>, i32, <2 x i1>) 1149declare void @llvm.masked.scatter.v4i8(<4 x i8>, <4 x ptr>, i32, <4 x i1>) 1150declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x ptr>, i32, <8 x i1>) 1151declare void @llvm.masked.scatter.v16i8(<16 x i8>, <16 x ptr>, i32, <16 x i1>) 1152declare void @llvm.masked.scatter.v32i8(<32 x i8>, <32 x ptr>, i32, <32 x i1>) 1153 1154declare void @llvm.masked.scatter.v2i16(<2 x i16>, <2 x ptr>, i32, <2 x i1>) 1155declare void @llvm.masked.scatter.v4i16(<4 x i16>, <4 x ptr>, i32, <4 x i1>) 1156declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x ptr>, i32, <8 x i1>) 1157declare void @llvm.masked.scatter.v16i16(<16 x i16>, <16 x ptr>, i32, <16 x i1>) 1158declare void @llvm.masked.scatter.v32i16(<32 x i16>, <32 x ptr>, i32, <32 x i1>) 1159 1160declare void 
@llvm.masked.scatter.v2i32(<2 x i32>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i32(<8 x i32>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i32(<32 x i32>, <32 x ptr>, i32, <32 x i1>)

declare void @llvm.masked.scatter.v1i64(<1 x i64>, <1 x ptr>, i32, <1 x i1>)
declare void @llvm.masked.scatter.v2i64(<2 x i64>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i64(<4 x i64>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i64(<8 x i64>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i64(<16 x i64>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i64(<32 x i64>, <32 x ptr>, i32, <32 x i1>)

declare void @llvm.masked.scatter.v2f16(<2 x half>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f16(<4 x half>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f16(<16 x half>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f16(<32 x half>, <32 x ptr>, i32, <32 x i1>)

declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x ptr>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8f32(<8 x float>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f32(<32 x float>, <32 x ptr>, i32, <32 x i1>)

declare void @llvm.masked.scatter.v1f64(<1 x double>, <1 x ptr>, i32, <1 x i1>)
declare void @llvm.masked.scatter.v2f64(<2 x double>, <2 x ptr>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f64(<4 x double>, <4 x ptr>, i32, <4 x
i1>)
declare void @llvm.masked.scatter.v8f64(<8 x double>, <8 x ptr>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f64(<16 x double>, <16 x ptr>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f64(<32 x double>, <32 x ptr>, i32, <32 x i1>)

; Shared attribute group: every test function requires the SVE target feature.
attributes #0 = { "target-features"="+sve" }