; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; Masked Loads
;

define <2 x half> @masked_load_v2f16(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s1, [x0]
; CHECK-NEXT:    ldr s2, [x1]
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    fcmeq v1.4h, v1.4h, v2.4h
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mov v0.h[0], v1.h[0]
; CHECK-NEXT:    mov w8, v1.s[1]
; CHECK-NEXT:    mov v0.h[1], w8
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x half>, ptr %ap
  %b = load <2 x half>, ptr %bp
  %mask = fcmp oeq <2 x half> %a, %b
  %load = call <2 x half> @llvm.masked.load.v2f16(ptr %ap, i32 8, <2 x i1> %mask, <2 x half> zeroinitializer)
  ret <2 x half> %load
}

define <2 x float> @masked_load_v2f32(ptr %ap, ptr %bp) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    fcmeq v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <2 x float>, ptr %ap
  %b = load <2 x float>, ptr %bp
  %mask = fcmp oeq <2 x float> %a, %b
  %load = call <2 x float> @llvm.masked.load.v2f32(ptr %ap, i32 8, <2 x i1> %mask, <2 x float> zeroinitializer)
  ret <2 x float> %load
}

define <4 x float> @masked_load_v4f32(ptr %ap, ptr %bp) vscale_range(1,0) #0 {
; CHECK-LABEL: masked_load_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    fcmeq v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT:    ret
  %a = load <4 x float>, ptr %ap
  %b = load <4 x float>, ptr %bp
  %mask = fcmp oeq <4 x float> %a, %b
  %load = call <4 x float> @llvm.masked.load.v4f32(ptr %ap, i32 8, <4 x i1> %mask, <4 x float> zeroinitializer)
  ret <4 x float> %load
}

define void @masked_load_v8f32(ptr %ap, ptr %bp, ptr %c) vscale_range(2,0) #0 {
; CHECK-LABEL: masked_load_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %ap
  %b = load <8 x float>, ptr %bp
  %mask = fcmp oeq <8 x float> %a, %b
  %load = call <8 x float> @llvm.masked.load.v8f32(ptr %ap, i32 8, <8 x i1> %mask, <8 x float> zeroinitializer)
  store <8 x float> %load, ptr %c
  ret void
}
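
; The v16f32 test below no longer fits in a 256-bit register, so the
; VBITS_GE_256 checks expect the operation to be split into two VL-8 halves,
; with the high half addressed via an element-count offset in x8
; (e.g. [x0, x8, lsl #2]).
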
define void @masked_load_v16f32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %ap
  %b = load <16 x float>, ptr %bp
  %mask = fcmp oeq <16 x float> %a, %b
  %load = call <16 x float> @llvm.masked.load.v16f32(ptr %ap, i32 8, <16 x i1> %mask, <16 x float> zeroinitializer)
  store <16 x float> %load, ptr %c
  ret void
}

define void @masked_load_v32f32(ptr %ap, ptr %bp, ptr %c) vscale_range(8,0) #0 {
; CHECK-LABEL: masked_load_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %ap
  %b = load <32 x float>, ptr %bp
  %mask = fcmp oeq <32 x float> %a, %b
  %load = call <32 x float> @llvm.masked.load.v32f32(ptr %ap, i32 8, <32 x i1> %mask, <32 x float> zeroinitializer)
  store <32 x float> %load, ptr %c
  ret void
}

define void @masked_load_v64f32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %ap
  %b = load <64 x float>, ptr %bp
  %mask = fcmp oeq <64 x float> %a, %b
  %load = call <64 x float> @llvm.masked.load.v64f32(ptr %ap, i32 8, <64 x i1> %mask, <64 x float> zeroinitializer)
  store <64 x float> %load, ptr %c
  ret void
}
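
;
; Integer masked loads. The mask now comes from an integer cmpeq rather than
; fcmeq, and because the passthru is undef no merging of the inactive lanes
; is expected in the checks below.
;
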
define void @masked_load_v64i8(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT:    ld1b { z2.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_256-NEXT:    cmpeq p2.b, p0/z, z2.b, z3.b
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x2, x8]
; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.b, p0/z, z0.b, z1.b
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <64 x i8>, ptr %ap
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %a, %b
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  store <64 x i8> %load, ptr %c
  ret void
}

define void @masked_load_v32i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z2.h, z3.h
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %ap
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %a, %b
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  store <32 x i16> %load, ptr %c
  ret void
}

define void @masked_load_v16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %ap
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %a, %b
  %load = call <16 x i32> @llvm.masked.load.v16i32(ptr %ap, i32 8, <16 x i1> %mask, <16 x i32> undef)
  store <16 x i32> %load, ptr %c
  ret void
}

define void @masked_load_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> undef)
  store <8 x i64> %load, ptr %c
  ret void
}
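
;
; Masked loads with a non-undef passthru: the inactive lanes must take the
; passthru value, hence the extra sel merging the loaded vector with it.
;
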
define void @masked_load_passthru_v8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %ap
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %a, %b
  %load = call <8 x i64> @llvm.masked.load.v8i64(ptr %ap, i32 8, <8 x i1> %mask, <8 x i64> %b)
  store <8 x i64> %load, ptr %c
  ret void
}

define void @masked_load_passthru_v8f64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p2/z, [x0]
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_passthru_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %ap
  %b = load <8 x double>, ptr %bp
  %mask = fcmp oeq <8 x double> %a, %b
  %load = call <8 x double> @llvm.masked.load.v8f64(ptr %ap, i32 8, <8 x i1> %mask, <8 x double> %b)
  store <8 x double> %load, ptr %c
  ret void
}
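
;
; Masked loads sign-extended to a wider element type. With VBITS_GE_512 the
; load and extend are expected to fold into a single extending masked load
; (ld1sb/ld1sh/ld1sw); VBITS_GE_256 instead loads at the narrow type and
; widens the halves with sunpklo/sshll.
;
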
define void @masked_load_sext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
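
;
; The zero-extending equivalents: ld1b/ld1h/ld1w extend unsigned for
; VBITS_GE_512, while VBITS_GE_256 widens with uunpklo/ushll.
;
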
define void @masked_load_zext_v32i8i16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i8i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmeq v0.16b, v0.16b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i8>, ptr %bp
  %mask = icmp eq <16 x i8> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i8i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr d0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.b, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8b, v0.8b, #0
; VBITS_GE_256-NEXT:    cmpne p0.b, p0/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i8>, ptr %bp
  %mask = icmp eq <8 x i8> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i16i32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i16>, ptr %bp
  %mask = icmp eq <16 x i16> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i16i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ldr q0, [x1]
; VBITS_GE_256-NEXT:    ptrue p0.h, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmeq v0.8h, v0.8h, #0
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i16>, ptr %bp
  %mask = icmp eq <8 x i16> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp eq <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
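
;
; The _mNN tests compute the mask at a wider element type (NN bits) than the
; loaded elements, so for VBITS_GE_256 the wide predicate must first be
; narrowed (via uzp1/splice and a fresh cmpne) before it can control the
; narrow load.
;
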
define void @masked_load_sext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v32i8i16_m16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i8i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sshll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i8i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sb { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v16i16i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = sext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    sunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    sunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    sunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sh { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = sext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_sext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
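
;
; The same wide-mask combinations, but zero-extending the loaded value.
;
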
define void @masked_load_zext_v32i8i16_m16(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    cmpeq p2.h, p0/z, z1.h, #0
; VBITS_GE_256-NEXT:    mov z0.h, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.h, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    splice z1.b, p1, z1.b, z0.b
; VBITS_GE_256-NEXT:    ptrue p1.b, vl32
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.h, z0.b
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x2]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x2, x8, lsl #1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v32i8i16_m16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1b { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i8i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.b, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z1.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT:    uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i8i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1b { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i8> @llvm.masked.load.v16i8(ptr %ap, i32 8, <16 x i1> %mask, <16 x i8> undef)
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i8i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.b, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    cmpne p1.b, p1/z, z0.b, #0
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ushll v0.8h, v0.8b, #0
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i8i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1b { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i8> @llvm.masked.load.v8i8(ptr %ap, i32 8, <8 x i1> %mask, <8 x i8> undef)
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v16i16i32_m32(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    cmpeq p2.s, p0/z, z1.s, #0
; VBITS_GE_256-NEXT:    mov z0.s, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.s, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    uzp1 z0.h, z0.h, z0.h
; VBITS_GE_256-NEXT:    uzp1 z1.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT:    uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT:    mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT:    sunpklo z0.h, z1.b
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x2, x8, lsl #2]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v16i16i32_m32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1h { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <16 x i32>, ptr %bp
  %mask = icmp eq <16 x i32> %b, zeroinitializer
  %load = call <16 x i16> @llvm.masked.load.v16i16(ptr %ap, i32 8, <16 x i1> %mask, <16 x i16> undef)
  %ext = zext <16 x i16> %load to <16 x i32>
  store <16 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i16i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.h, vl8
; VBITS_GE_256-NEXT:    uzp1 z0.h, z1.h, z1.h
; VBITS_GE_256-NEXT:    cmpne p1.h, p1/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    uunpklo z1.d, z1.s
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i16i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1h { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i16> @llvm.masked.load.v8i16(ptr %ap, i32 8, <8 x i1> %mask, <8 x i16> undef)
  %ext = zext <8 x i16> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v8i32i64_m64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    cmpeq p2.d, p0/z, z1.d, #0
; VBITS_GE_256-NEXT:    mov z0.d, p1/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    mov z1.d, p2/z, #-1 // =0xffffffffffffffff
; VBITS_GE_256-NEXT:    ptrue p1.s, vl4
; VBITS_GE_256-NEXT:    uzp1 z0.s, z0.s, z0.s
; VBITS_GE_256-NEXT:    uzp1 z1.s, z1.s, z1.s
; VBITS_GE_256-NEXT:    splice z1.s, p1, z1.s, z0.s
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    cmpne p1.s, p1/z, z1.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_v8i32i64_m64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i64>, ptr %bp
  %mask = icmp eq <8 x i64> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
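
;
; With vscale_range(16,0) a single register is at least 2048 bits, so each
; of these wide fixed vectors fits in one SVE register and a single
; predicated (extending) load per function suffices.
;
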
z0.s }, p0, [x2] 1316; CHECK-NEXT: ret 1317 %b = load <64 x i8>, ptr %bp 1318 %mask = icmp eq <64 x i8> %b, zeroinitializer 1319 %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef) 1320 %ext = sext <64 x i8> %load to <64 x i32> 1321 store <64 x i32> %ext, ptr %c 1322 ret void 1323} 1324 1325define void @masked_load_sext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { 1326; CHECK-LABEL: masked_load_sext_v32i8i64: 1327; CHECK: // %bb.0: 1328; CHECK-NEXT: ptrue p0.d, vl32 1329; CHECK-NEXT: ld1b { z0.d }, p0/z, [x1] 1330; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 1331; CHECK-NEXT: ld1sb { z0.d }, p1/z, [x0] 1332; CHECK-NEXT: st1d { z0.d }, p0, [x2] 1333; CHECK-NEXT: ret 1334 %b = load <32 x i8>, ptr %bp 1335 %mask = icmp eq <32 x i8> %b, zeroinitializer 1336 %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef) 1337 %ext = sext <32 x i8> %load to <32 x i64> 1338 store <32 x i64> %ext, ptr %c 1339 ret void 1340} 1341 1342define void @masked_load_sext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { 1343; CHECK-LABEL: masked_load_sext_v64i16i32: 1344; CHECK: // %bb.0: 1345; CHECK-NEXT: ptrue p0.s, vl64 1346; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] 1347; CHECK-NEXT: cmpeq p1.s, p0/z, z0.s, #0 1348; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0] 1349; CHECK-NEXT: st1w { z0.s }, p0, [x2] 1350; CHECK-NEXT: ret 1351 %b = load <64 x i16>, ptr %bp 1352 %mask = icmp eq <64 x i16> %b, zeroinitializer 1353 %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef) 1354 %ext = sext <64 x i16> %load to <64 x i32> 1355 store <64 x i32> %ext, ptr %c 1356 ret void 1357} 1358 1359define void @masked_load_sext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { 1360; CHECK-LABEL: masked_load_sext_v32i16i64: 1361; CHECK: // %bb.0: 1362; CHECK-NEXT: ptrue p0.d, vl32 1363; CHECK-NEXT: ld1h { z0.d }, p0/z, [x1] 1364; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 1365; CHECK-NEXT: ld1sh { z0.d }, p1/z, [x0] 1366; CHECK-NEXT: st1d { z0.d }, p0, [x2] 1367; CHECK-NEXT: ret 1368 %b = load <32 x i16>, ptr %bp 1369 %mask = icmp eq <32 x i16> %b, zeroinitializer 1370 %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef) 1371 %ext = sext <32 x i16> %load to <32 x i64> 1372 store <32 x i64> %ext, ptr %c 1373 ret void 1374} 1375 1376define void @masked_load_sext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { 1377; CHECK-LABEL: masked_load_sext_v32i32i64: 1378; CHECK: // %bb.0: 1379; CHECK-NEXT: ptrue p0.d, vl32 1380; CHECK-NEXT: ld1w { z0.d }, p0/z, [x1] 1381; CHECK-NEXT: cmpeq p1.d, p0/z, z0.d, #0 1382; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0] 1383; CHECK-NEXT: st1d { z0.d }, p0, [x2] 1384; CHECK-NEXT: ret 1385 %b = load <32 x i32>, ptr %bp 1386 %mask = icmp eq <32 x i32> %b, zeroinitializer 1387 %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef) 1388 %ext = sext <32 x i32> %load to <32 x i64> 1389 store <32 x i64> %ext, ptr %c 1390 ret void 1391} 1392 1393define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 { 1394; CHECK-LABEL: masked_load_zext_v128i8i16: 1395; CHECK: // %bb.0: 1396; CHECK-NEXT: ptrue p0.h, vl128 1397; CHECK-NEXT: ld1b { z0.h }, p0/z, [x1] 1398; CHECK-NEXT: cmpeq p1.h, p0/z, z0.h, #0 1399; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0] 1400; CHECK-NEXT: st1h { z0.h }, p0, [x2] 1401; CHECK-NEXT: ret 1402 %b = load <128 x i8>, ptr %bp 1403 
define void @masked_load_zext_v128i8i16(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v128i8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1b { z0.h }, p1/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <128 x i8>, ptr %bp
  %mask = icmp eq <128 x i8> %b, zeroinitializer
  %load = call <128 x i8> @llvm.masked.load.v128i8(ptr %ap, i32 8, <128 x i1> %mask, <128 x i8> undef)
  %ext = zext <128 x i8> %load to <128 x i16>
  store <128 x i16> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i8i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1b { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i8>, ptr %bp
  %mask = icmp eq <64 x i8> %b, zeroinitializer
  %load = call <64 x i8> @llvm.masked.load.v64i8(ptr %ap, i32 8, <64 x i1> %mask, <64 x i8> undef)
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i8i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1b { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i8>, ptr %bp
  %mask = icmp eq <32 x i8> %b, zeroinitializer
  %load = call <32 x i8> @llvm.masked.load.v32i8(ptr %ap, i32 8, <32 x i1> %mask, <32 x i8> undef)
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v64i16i32(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v64i16i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1h { z0.s }, p1/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <64 x i16>, ptr %bp
  %mask = icmp eq <64 x i16> %b, zeroinitializer
  %load = call <64 x i16> @llvm.masked.load.v64i16(ptr %ap, i32 8, <64 x i1> %mask, <64 x i16> undef)
  %ext = zext <64 x i16> %load to <64 x i32>
  store <64 x i32> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i16i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1h { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i16>, ptr %bp
  %mask = icmp eq <32 x i16> %b, zeroinitializer
  %load = call <32 x i16> @llvm.masked.load.v32i16(ptr %ap, i32 8, <32 x i1> %mask, <32 x i16> undef)
  %ext = zext <32 x i16> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_v32i32i64(ptr %ap, ptr %bp, ptr %c) vscale_range(16,0) #0 {
; CHECK-LABEL: masked_load_zext_v32i32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x1]
; CHECK-NEXT:    cmpeq p1.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x2]
; CHECK-NEXT:    ret
  %b = load <32 x i32>, ptr %bp
  %mask = icmp eq <32 x i32> %b, zeroinitializer
  %load = call <32 x i32> @llvm.masked.load.v32i32(ptr %ap, i32 8, <32 x i1> %mask, <32 x i32> undef)
  %ext = zext <32 x i32> %load to <32 x i64>
  store <32 x i64> %ext, ptr %c
  ret void
}
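
; Masks from non-equality compares: for VBITS_GE_512 the compare operand is
; widened with an extension that preserves the predicate (ld1w for the
; unsigned compare, ld1sw for the signed one), and 'icmp ugt ..., 0' is
; simplified to a compare-not-equal against zero on both paths.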
define void @masked_load_sext_ugt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    sunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    sunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_sext_ugt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpne p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp ugt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = sext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}

define void @masked_load_zext_sgt_v8i32i64(ptr %ap, ptr %bp, ptr %c) #0 {
; VBITS_GE_256-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    uunpklo z1.d, z0.s
; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT:    uunpklo z0.d, z0.s
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x2]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x2, x8, lsl #3]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: masked_load_zext_sgt_v8i32i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1sw { z0.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    cmpgt p1.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1w { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x2]
; VBITS_GE_512-NEXT:    ret
  %b = load <8 x i32>, ptr %bp
  %mask = icmp sgt <8 x i32> %b, zeroinitializer
  %load = call <8 x i32> @llvm.masked.load.v8i32(ptr %ap, i32 8, <8 x i1> %mask, <8 x i32> undef)
  %ext = zext <8 x i32> %load to <8 x i64>
  store <8 x i64> %ext, ptr %c
  ret void
}
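
; Declarations for the llvm.masked.load intrinsics used above; each takes the
; source pointer, an alignment immediate, the mask, and a passthru vector.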
declare <2 x half> @llvm.masked.load.v2f16(ptr, i32, <2 x i1>, <2 x half>)
declare <2 x float> @llvm.masked.load.v2f32(ptr, i32, <2 x i1>, <2 x float>)
declare <4 x float> @llvm.masked.load.v4f32(ptr, i32, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.load.v8f32(ptr, i32, <8 x i1>, <8 x float>)
declare <16 x float> @llvm.masked.load.v16f32(ptr, i32, <16 x i1>, <16 x float>)
declare <32 x float> @llvm.masked.load.v32f32(ptr, i32, <32 x i1>, <32 x float>)
declare <64 x float> @llvm.masked.load.v64f32(ptr, i32, <64 x i1>, <64 x float>)

declare <128 x i8> @llvm.masked.load.v128i8(ptr, i32, <128 x i1>, <128 x i8>)
declare <64 x i8> @llvm.masked.load.v64i8(ptr, i32, <64 x i1>, <64 x i8>)
declare <32 x i8> @llvm.masked.load.v32i8(ptr, i32, <32 x i1>, <32 x i8>)
declare <16 x i8> @llvm.masked.load.v16i8(ptr, i32, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.masked.load.v16i16(ptr, i32, <16 x i1>, <16 x i16>)
declare <8 x i8> @llvm.masked.load.v8i8(ptr, i32, <8 x i1>, <8 x i8>)
declare <8 x i16> @llvm.masked.load.v8i16(ptr, i32, <8 x i1>, <8 x i16>)
declare <8 x i32> @llvm.masked.load.v8i32(ptr, i32, <8 x i1>, <8 x i32>)
declare <32 x i32> @llvm.masked.load.v32i32(ptr, i32, <32 x i1>, <32 x i32>)
declare <32 x i16> @llvm.masked.load.v32i16(ptr, i32, <32 x i1>, <32 x i16>)
declare <64 x i16> @llvm.masked.load.v64i16(ptr, i32, <64 x i1>, <64 x i16>)
declare <16 x i32> @llvm.masked.load.v16i32(ptr, i32, <16 x i1>, <16 x i32>)
declare <8 x i64> @llvm.masked.load.v8i64(ptr, i32, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)

attributes #0 = { "target-features"="+sve" }