; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve --asm-verbose=false < %s | FileCheck %s
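
; Summary of what these tests expect (inferred from the CHECK lines below):
; each test indexes %base with an element-typed getelementptr and expects the
; contiguous masked load/store to select the reg+reg addressing mode
; [x0, x1{, lsl #shift}], where the shift matches the in-memory element size
; (no shift for bytes, lsl #1 for halfwords, lsl #2 for words, lsl #3 for
; doublewords). For the extending-load and truncating-store tests, the
; zext/sext/trunc is expected to fold into the ld1*/ld1s*/st1* instruction,
; so only the memory instruction and the ret are checked.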

; 2-lane contiguous load/stores.

define void @test_masked_ldst_sv2i8(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %data, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %data, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i32:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %data, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2i64:
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_i64 = getelementptr i64, ptr %base, i64 %offset
  %data = call <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr %base_i64, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  call void @llvm.masked.store.nxv2i64(<vscale x 2 x i64> %data, ptr %base_i64, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f16(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_half = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr %base_half, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
  call void @llvm.masked.store.nxv2f16(<vscale x 2 x half> %data, ptr %base_half, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f32(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_float = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr %base_float, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
  call void @llvm.masked.store.nxv2f32(<vscale x 2 x float> %data, ptr %base_float, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv2f64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv2f64:
; CHECK-NEXT: ld1d { z[[DATA:[0-9]+]].d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: st1d { z[[DATA]].d }, p0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %base_double = getelementptr double, ptr %base, i64 %offset
  %data = call <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr %base_double, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
  call void @llvm.masked.store.nxv2f64(<vscale x 2 x double> %data, ptr %base_double, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 2-lane zero/sign extended contiguous loads.

define <vscale x 2 x i64> @masked_zload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i8_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i8_to_sv2i64:
; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr %base_i8, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %ext = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i16_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i16_to_sv2i64:
; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr %base_i16, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
  %ext = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_zload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

define <vscale x 2 x i64> @masked_sload_sv2i32_to_sv2i64(ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv2i32_to_sv2i64:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %load = call <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr %base_i32, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  %ext = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %ext
}

; 2-lane truncating contiguous stores.

define void @masked_trunc_store_sv2i64_to_sv2i8(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i8:
; CHECK-NEXT: st1b { z0.d }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i8>
  call void @llvm.masked.store.nxv2i8(<vscale x 2 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i16(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i16:
; CHECK-NEXT: st1h { z0.d }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i16>
  call void @llvm.masked.store.nxv2i16(<vscale x 2 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv2i64_to_sv2i32(<vscale x 2 x i64> %val, ptr %base, <vscale x 2 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv2i64_to_sv2i32:
; CHECK-NEXT: st1w { z0.d }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %trunc = trunc <vscale x 2 x i64> %val to <vscale x 2 x i32>
  call void @llvm.masked.store.nxv2i32(<vscale x 2 x i32> %trunc, ptr %base_i32, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

; 4-lane contiguous load/stores.

define void @test_masked_ldst_sv4i8(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %data, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %data, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4i32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_i32 = getelementptr i32, ptr %base, i64 %offset
  %data = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr %base_i32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
  call void @llvm.masked.store.nxv4i32(<vscale x 4 x i32> %data, ptr %base_i32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f16(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr %base_f16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
  call void @llvm.masked.store.nxv4f16(<vscale x 4 x half> %data, ptr %base_f16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv4f32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv4f32:
; CHECK-NEXT: ld1w { z[[DATA:[0-9]+]].s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: st1w { z[[DATA]].s }, p0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %base_f32 = getelementptr float, ptr %base, i64 %offset
  %data = call <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr %base_f32, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
  call void @llvm.masked.store.nxv4f32(<vscale x 4 x float> %data, ptr %base_f32, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 4-lane zero/sign extended contiguous loads.

define <vscale x 4 x i32> @masked_zload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i8_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i8_to_sv4i32:
; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr %base_i8, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
  %ext = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_zload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

define <vscale x 4 x i32> @masked_sload_sv4i16_to_sv4i32(ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv4i16_to_sv4i32:
; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %load = call <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr %base_i16, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
  %ext = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
  ret <vscale x 4 x i32> %ext
}

; 4-lane truncating contiguous stores.

define void @masked_trunc_store_sv4i32_to_sv4i8(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i8:
; CHECK-NEXT: st1b { z0.s }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i8>
  call void @llvm.masked.store.nxv4i8(<vscale x 4 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @masked_trunc_store_sv4i32_to_sv4i16(<vscale x 4 x i32> %val, ptr %base, <vscale x 4 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv4i32_to_sv4i16:
; CHECK-NEXT: st1h { z0.s }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %trunc = trunc <vscale x 4 x i32> %val to <vscale x 4 x i16>
  call void @llvm.masked.store.nxv4i16(<vscale x 4 x i16> %trunc, ptr %base_i16, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

; 8-lane contiguous load/stores.

define void @test_masked_ldst_sv8i8(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %data, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8i16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_i16 = getelementptr i16, ptr %base, i64 %offset
  %data = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr %base_i16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i16> undef)
  call void @llvm.masked.store.nxv8i16(<vscale x 8 x i16> %data, ptr %base_i16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8f16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv8f16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr half, ptr %base, i64 %offset
  %data = call <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x half> undef)
  call void @llvm.masked.store.nxv8f16(<vscale x 8 x half> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @test_masked_ldst_sv8bf16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind #0 {
; CHECK-LABEL: test_masked_ldst_sv8bf16:
; CHECK-NEXT: ld1h { z[[DATA:[0-9]+]].h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: st1h { z[[DATA]].h }, p0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %base_f16 = getelementptr bfloat, ptr %base, i64 %offset
  %data = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr %base_f16, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
  call void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat> %data, ptr %base_f16, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 8-lane zero/sign extended contiguous loads.

define <vscale x 8 x i16> @masked_zload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_zload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

define <vscale x 8 x i16> @masked_sload_sv8i8_to_sv8i16(ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_sload_sv8i8_to_sv8i16:
; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %load = call <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr %base_i8, i32 1, <vscale x 8 x i1> %mask, <vscale x 8 x i8> undef)
  %ext = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
  ret <vscale x 8 x i16> %ext
}

; 8-lane truncating contiguous stores.

define void @masked_trunc_store_sv8i16_to_sv8i8(<vscale x 8 x i16> %val, ptr %base, <vscale x 8 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: masked_trunc_store_sv8i16_to_sv8i8:
; CHECK-NEXT: st1b { z0.h }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %trunc = trunc <vscale x 8 x i16> %val to <vscale x 8 x i8>
  call void @llvm.masked.store.nxv8i8(<vscale x 8 x i8> %trunc, ptr %base_i8, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

; 16-lane contiguous load/stores.

define void @test_masked_ldst_sv16i8(ptr %base, <vscale x 16 x i1> %mask, i64 %offset) nounwind {
; CHECK-LABEL: test_masked_ldst_sv16i8:
; CHECK-NEXT: ld1b { z[[DATA:[0-9]+]].b }, p0/z, [x0, x1]
; CHECK-NEXT: st1b { z[[DATA]].b }, p0, [x0, x1]
; CHECK-NEXT: ret
  %base_i8 = getelementptr i8, ptr %base, i64 %offset
  %data = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr %base_i8, i32 1, <vscale x 16 x i1> %mask, <vscale x 16 x i8> undef)
  call void @llvm.masked.store.nxv16i8(<vscale x 16 x i8> %data, ptr %base_i8, i32 1, <vscale x 16 x i1> %mask)
  ret void
}

; 2-element contiguous loads.
declare <vscale x 2 x i8> @llvm.masked.load.nxv2i8(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.load.nxv2i16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.load.nxv2i32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
declare <vscale x 2 x i64> @llvm.masked.load.nxv2i64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
declare <vscale x 2 x half> @llvm.masked.load.nxv2f16(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
declare <vscale x 2 x float> @llvm.masked.load.nxv2f32(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
declare <vscale x 2 x double> @llvm.masked.load.nxv2f64(ptr, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

; 4-element contiguous loads.
declare <vscale x 4 x i8> @llvm.masked.load.nxv4i8(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
declare <vscale x 4 x i16> @llvm.masked.load.nxv4i16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(ptr, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

; 8-element contiguous loads.
declare <vscale x 8 x i8> @llvm.masked.load.nxv8i8(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i8>)
declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

; 16-element contiguous loads.
declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)

; 2-element contiguous stores.
declare void @llvm.masked.store.nxv2i8(<vscale x 2 x i8>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i16(<vscale x 2 x i16>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i32(<vscale x 2 x i32>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f16(<vscale x 2 x half>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f32(<vscale x 2 x float>, ptr, i32, <vscale x 2 x i1>)
declare void @llvm.masked.store.nxv2f64(<vscale x 2 x double>, ptr, i32, <vscale x 2 x i1>)

; 4-element contiguous stores.
declare void @llvm.masked.store.nxv4i8(<vscale x 4 x i8>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i16(<vscale x 4 x i16>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f16(<vscale x 4 x half>, ptr, i32, <vscale x 4 x i1>)
declare void @llvm.masked.store.nxv4f32(<vscale x 4 x float>, ptr, i32, <vscale x 4 x i1>)

; 8-element contiguous stores.
declare void @llvm.masked.store.nxv8i8(<vscale x 8 x i8>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8i16(<vscale x 8 x i16>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8f16(<vscale x 8 x half>, ptr, i32, <vscale x 8 x i1>)
declare void @llvm.masked.store.nxv8bf16(<vscale x 8 x bfloat>, ptr, i32, <vscale x 8 x i1>)

; 16-element contiguous stores.
declare void @llvm.masked.store.nxv16i8(<vscale x 16 x i8>, ptr, i32, <vscale x 16 x i1>)

; +bf16 is required for the bfloat version.
attributes #0 = { "target-features"="+sve,+bf16" }