; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s

;
; Masked Loads
;

define <vscale x 2 x i64> @masked_sload_nxv2i8(ptr %a, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv2i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_nxv2i16(ptr %a, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_nxv2i32(ptr %a, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 4 x i32> @masked_sload_nxv4i8(ptr %a, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_nxv4i16(ptr %a, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %a, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 8 x i16> @masked_sload_nxv8i8(ptr %a, <vscale x 8 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ret
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %a, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 2 x i64> @masked_sload_passthru(ptr %a, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru) {
; CHECK-LABEL: masked_sload_passthru:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    ld1sw { z1.d }, p0/z, [x0]
; CHECK-NEXT:    sxtw z0.d, p1/m, z0.d
; CHECK-NEXT:    mov z0.d, p0/m, z1.d
; CHECK-NEXT:    ret
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %a, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; Return type requires splitting
define <vscale x 16 x i32> @masked_sload_nxv16i8(ptr %a, <vscale x 16 x i1> %mask) {
; CHECK-LABEL: masked_sload_nxv16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    sunpklo z1.h, z0.b
; CHECK-NEXT:    sunpkhi z3.h, z0.b
; CHECK-NEXT:    sunpklo z0.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    sunpklo z2.s, z3.h
; CHECK-NEXT:    sunpkhi z3.s, z3.h
; CHECK-NEXT:    ret
  %load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %a, i32 2, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  %ext = sext <vscale x 16 x i8> %load to <vscale x 16 x i32>
  ret <vscale x 16 x i32> %ext
}

; Masked load requires promotion
define <vscale x 4 x double> @masked_sload_4i8_4f32(ptr noalias %in, <vscale x 4 x i1> %mask) {
; CHECK-LABEL: masked_sload_4i8_4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    sunpkhi z1.d, z0.s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    scvtf z0.d, p1/m, z0.d
; CHECK-NEXT:    scvtf z1.d, p1/m, z1.d
; CHECK-NEXT:    ret
  %wide.load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %in, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %sext = sext <vscale x 4 x i8> %wide.load to <vscale x 4 x i64>
  %res = sitofp <vscale x 4 x i64> %sext to <vscale x 4 x double>
  ret <vscale x 4 x double> %res
}


; Extending loads from unpacked to wide illegal types

define <vscale x 4 x i64> @masked_sload_4i8_4i64(ptr %a, <vscale x 4 x i1> %b) {
; CHECK-LABEL: masked_sload_4i8_4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z1.s }, p0/z, [x0]
; CHECK-NEXT:    sunpklo z0.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    ret
  %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i8> zeroinitializer)
  %aext = sext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
  ret <vscale x 4 x i64> %aext
}

define <vscale x 4 x i64> @masked_sload_4i16_4i64(ptr %a, <vscale x 4 x i1> %b) {
; CHECK-LABEL: masked_sload_4i16_4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sh { z1.s }, p0/z, [x0]
; CHECK-NEXT:    sunpklo z0.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    ret
  %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %a, i32 16, <vscale x 4 x i1> %b, <vscale x 4 x i16> zeroinitializer)
  %aext = sext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
  ret <vscale x 4 x i64> %aext
}

define <vscale x 8 x i32> @masked_sload_8i8_8i32(ptr %a, <vscale x 8 x i1> %b) {
; CHECK-LABEL: masked_sload_8i8_8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z1.h }, p0/z, [x0]
; CHECK-NEXT:    sunpklo z0.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    ret
  %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
  %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
  ret <vscale x 8 x i32> %aext
}

define <vscale x 8 x i64> @masked_sload_8i8_8i64(ptr %a, <vscale x 8 x i1> %b) {
; CHECK-LABEL: masked_sload_8i8_8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ld1sb { z0.h }, p0/z, [x0]
; CHECK-NEXT:    sunpklo z1.s, z0.h
; CHECK-NEXT:    sunpkhi z3.s, z0.h
; CHECK-NEXT:    sunpklo z0.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z2.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    ret
  %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %a, i32 16, <vscale x 8 x i1> %b, <vscale x 8 x i8> zeroinitializer)
  %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
  ret <vscale x 8 x i64> %aext
}

define <vscale x 4 x i64> @masked_sload_x2_4i8_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
; CHECK-LABEL: masked_sload_x2_4i8_4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1sb { z1.d }, p1/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1sb { z2.d }, p1/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z3.d }, p0/z, [x1]
; CHECK-NEXT:    add z1.d, z1.d, z2.d
; CHECK-NEXT:    add z0.d, z0.d, z3.d
; CHECK-NEXT:    ret
  %aval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
  %bval = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i8> zeroinitializer)
  %aext = sext <vscale x 4 x i8> %aval to <vscale x 4 x i64>
  %bext = sext <vscale x 4 x i8> %bval to <vscale x 4 x i64>
  %res = add <vscale x 4 x i64> %aext, %bext
  ret <vscale x 4 x i64> %res
}

define <vscale x 4 x i64> @masked_sload_x2_4i16_4i64(ptr %a, ptr %b, <vscale x 4 x i1> %c) {
; CHECK-LABEL: masked_sload_x2_4i16_4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1sh { z1.d }, p1/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1sh { z2.d }, p1/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sh { z3.d }, p0/z, [x1]
; CHECK-NEXT:    add z1.d, z1.d, z2.d
; CHECK-NEXT:    add z0.d, z0.d, z3.d
; CHECK-NEXT:    ret
  %aval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %a, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
  %bval = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %b, i32 16, <vscale x 4 x i1> %c, <vscale x 4 x i16> zeroinitializer)
  %aext = sext <vscale x 4 x i16> %aval to <vscale x 4 x i64>
  %bext = sext <vscale x 4 x i16> %bval to <vscale x 4 x i64>
  %res = add <vscale x 4 x i64> %aext, %bext
  ret <vscale x 4 x i64> %res
}

define <vscale x 8 x i32> @masked_sload_x2_8i8_8i32(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
; CHECK-LABEL: masked_sload_x2_8i8_8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1sb { z1.s }, p1/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1sb { z2.s }, p1/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z3.s }, p0/z, [x1]
; CHECK-NEXT:    add z1.s, z1.s, z2.s
; CHECK-NEXT:    add z0.s, z0.s, z3.s
; CHECK-NEXT:    ret
  %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
  %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
  %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i32>
  %bext = sext <vscale x 8 x i8> %bval to <vscale x 8 x i32>
  %res = add <vscale x 8 x i32> %aext, %bext
  ret <vscale x 8 x i32> %res
}

define <vscale x 8 x i64> @masked_sload_x2_8i8_8i64(ptr %a, ptr %b, <vscale x 8 x i1> %c) {
; CHECK-LABEL: masked_sload_x2_8i8_8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpkhi p1.h, p0.b
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    punpkhi p2.h, p1.b
; CHECK-NEXT:    punpklo p1.h, p1.b
; CHECK-NEXT:    punpkhi p3.h, p0.b
; CHECK-NEXT:    ld1sb { z3.d }, p2/z, [x0, #3, mul vl]
; CHECK-NEXT:    ld1sb { z5.d }, p2/z, [x1, #3, mul vl]
; CHECK-NEXT:    punpklo p0.h, p0.b
; CHECK-NEXT:    ld1sb { z2.d }, p1/z, [x0, #2, mul vl]
; CHECK-NEXT:    ld1sb { z6.d }, p1/z, [x1, #2, mul vl]
; CHECK-NEXT:    ld1sb { z1.d }, p3/z, [x0, #1, mul vl]
; CHECK-NEXT:    ld1sb { z7.d }, p3/z, [x1, #1, mul vl]
; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1sb { z4.d }, p0/z, [x1]
; CHECK-NEXT:    add z3.d, z3.d, z5.d
; CHECK-NEXT:    add z2.d, z2.d, z6.d
; CHECK-NEXT:    add z1.d, z1.d, z7.d
; CHECK-NEXT:    add z0.d, z0.d, z4.d
; CHECK-NEXT:    ret
  %aval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %a, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
  %bval = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %b, i32 16, <vscale x 8 x i1> %c, <vscale x 8 x i8> zeroinitializer)
  %aext = sext <vscale x 8 x i8> %aval to <vscale x 8 x i64>
  %bext = sext <vscale x 8 x i8> %bval to <vscale x 8 x i64>
  %res = add <vscale x 8 x i64> %aext, %bext
  ret <vscale x 8 x i64> %res
}


declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)