; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s | FileCheck %s

;
; LD1RQB
;

define <vscale x 16 x i8> @ld1rqb_i8(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %addr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 16
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_scalar(<vscale x 16 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqb_i8_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 %idx
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_lower_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_lower_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #-128]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 -128
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_upper_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_upper_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #112]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i8 112
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_lower_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_lower_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-129
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 -129
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_out_of_upper_bound(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_out_of_upper_bound:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #113
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x8]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 113
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1> %pred, ptr %ptr)
  ret <vscale x 16 x i8> %res
}

define <vscale x 16 x i8> @ld1rqb_i8_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqb_i8_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <16 x i8>, ptr %addr, i16 -1
  %load = load <16 x i8>, ptr %ptr
  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
  ret <vscale x 16 x i8> %2
}

define <vscale x 16 x i8> @ld1rqb_i8_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqb_i8_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b
; CHECK-NEXT: ld1rqb { z0.b }, p0/z, [x0, x1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i8, ptr %addr, i64 %idx
  %load = load <16 x i8>, ptr %ptr
  %1 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> undef, <16 x i8> %load, i64 0)
  %2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8> %1, i64 0)
  ret <vscale x 16 x i8> %2
}

;
; LD1RQH
;

define <vscale x 8 x i16> @ld1rqh_i16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-64]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i16, ptr %addr, i16 -32
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds half, ptr %addr, i16 -8
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_i16_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i16, ptr %addr, i64 %idx
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ld1rqh_f16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_f16_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds half, ptr %addr, i64 %idx
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i16 -8
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_bf16_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i64 %idx
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, ptr %ptr)
  ret <vscale x 8 x bfloat> %res
}

define <vscale x 8 x i16> @ld1rqh_i16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_i16_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <8 x i16>, ptr %addr, i16 -1
  %load = load <8 x i16>, ptr %ptr
  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x i16> @ld1rqh_i16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_i16_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i16, ptr %addr, i64 %idx
  %load = load <8 x i16>, ptr %ptr
  %1 = tail call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16> undef, <8 x i16> %load, i64 0)
  %2 = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16> %1, i64 0)
  ret <vscale x 8 x i16> %2
}

define <vscale x 8 x half> @ld1rqh_f16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_f16_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <8 x half>, ptr %addr, i16 -1
  %load = load <8 x half>, ptr %ptr
  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
  ret <vscale x 8 x half> %2
}

define <vscale x 8 x half> @ld1rqh_f16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_f16_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds half, ptr %addr, i64 %idx
  %load = load <8 x half>, ptr %ptr
  %1 = tail call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half> undef, <8 x half> %load, i64 0)
  %2 = tail call <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half> %1, i64 0)
  ret <vscale x 8 x half> %2
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_imm_dupqlane(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqh_bf16_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, #-16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <8 x bfloat>, ptr %addr, i16 -1
  %load = load <8 x bfloat>, ptr %ptr
  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
  ret <vscale x 8 x bfloat> %2
}

define <vscale x 8 x bfloat> @ld1rqh_bf16_scalar_dupqlane(<vscale x 8 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqh_bf16_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: ld1rqh { z0.h }, p0/z, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds bfloat, ptr %addr, i64 %idx
  %load = load <8 x bfloat>, ptr %ptr
  %1 = tail call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat> undef, <8 x bfloat> %load, i64 0)
  %2 = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat> %1, i64 0)
  ret <vscale x 8 x bfloat> %2
}

;
; LD1RQW
;

define <vscale x 4 x i32> @ld1rqw_i32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_imm(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #112]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i32, ptr %addr, i32 28
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32_imm(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #32]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds float, ptr %addr, i32 8
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_scalar(<vscale x 4 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqw_i32_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i32, ptr %base, i64 %idx
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ld1rqw_f32_scalar(<vscale x 4 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqw_f32_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds float, ptr %base, i64 %idx
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1> %pred, ptr %ptr)
  ret <vscale x 4 x float> %res
}

define <vscale x 4 x i32> @ld1rqw_i32_imm_dupqlane(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_i32_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <4 x i32>, ptr %addr, i32 1
  %load = load <4 x i32>, ptr %ptr
  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x i32> @ld1rqw_i32_scalar_dupqlane(<vscale x 4 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqw_i32_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i32, ptr %addr, i64 %idx
  %load = load <4 x i32>, ptr %ptr
  %1 = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> undef, <4 x i32> %load, i64 0)
  %2 = tail call <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32> %1, i64 0)
  ret <vscale x 4 x i32> %2
}

define <vscale x 4 x float> @ld1rqw_f32_imm_dupqlane(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqw_f32_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <4 x float>, ptr %addr, i32 1
  %load = load <4 x float>, ptr %ptr
  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
  ret <vscale x 4 x float> %2
}

define <vscale x 4 x float> @ld1rqw_f32_scalar_dupqlane(<vscale x 4 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqw_f32_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: ld1rqw { z0.s }, p0/z, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds float, ptr %addr, i64 %idx
  %load = load <4 x float>, ptr %ptr
  %1 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> undef, <4 x float> %load, i64 0)
  %2 = tail call <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float> %1, i64 0)
  ret <vscale x 4 x float> %2
}

;
; LD1RQD
;

define <vscale x 2 x i64> @ld1rqd_i64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_imm(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #64]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i64, ptr %addr, i64 8
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64_imm(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64_imm:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #-128]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds double, ptr %addr, i64 -16
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_scalar(<vscale x 2 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqd_i64_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i64, ptr %base, i64 %idx
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ld1rqd_f64_scalar(<vscale x 2 x i1> %pred, ptr %base, i64 %idx) {
; CHECK-LABEL: ld1rqd_f64_scalar:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds double, ptr %base, i64 %idx
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1> %pred, ptr %ptr)
  ret <vscale x 2 x double> %res
}

define <vscale x 2 x i64> @ld1rqd_i64_imm_dupqlane(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_i64_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <2 x i64>, ptr %addr, i64 1
  %load = load <2 x i64>, ptr %ptr
  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x i64> @ld1rqd_i64_scalar_dupqlane(<vscale x 2 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqd_i64_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds i64, ptr %addr, i64 %idx
  %load = load <2 x i64>, ptr %ptr
  %1 = tail call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64> undef, <2 x i64> %load, i64 0)
  %2 = tail call <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64> %1, i64 0)
  ret <vscale x 2 x i64> %2
}

define <vscale x 2 x double> @ld1rqd_f64_imm_dupqlane(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ld1rqd_f64_imm_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, #16]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds <2 x double>, ptr %addr, i64 1
  %load = load <2 x double>, ptr %ptr
  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
  ret <vscale x 2 x double> %2
}

define <vscale x 2 x double> @ld1rqd_f64_scalar_dupqlane(<vscale x 2 x i1> %pred, ptr %addr, i64 %idx) {
; CHECK-LABEL: ld1rqd_f64_scalar_dupqlane:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: ld1rqd { z0.d }, p0/z, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %ptr = getelementptr inbounds double, ptr %addr, i64 %idx
  %load = load <2 x double>, ptr %ptr
  %1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> undef, <2 x double> %load, i64 0)
  %2 = tail call <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double> %1, i64 0)
  ret <vscale x 2 x double> %2
}

;
; LDNT1B
;

define <vscale x 16 x i8> @ldnt1b_i8(<vscale x 16 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1b_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1> %pred, ptr %addr)
  ret <vscale x 16 x i8> %res
}

;
; LDNT1H
;

define <vscale x 8 x i16> @ldnt1h_i16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x i16> %res
}

define <vscale x 8 x half> @ldnt1h_f16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x half> %res
}

define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1h_bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred, ptr %addr)
  ret <vscale x 8 x bfloat> %res
}

;
; LDNT1W
;

define <vscale x 4 x i32> @ldnt1w_i32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1w_i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x i32> %res
}

define <vscale x 4 x float> @ldnt1w_f32(<vscale x 4 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1w_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1> %pred, ptr %addr)
  ret <vscale x 4 x float> %res
}

;
; LDNT1D
;

define <vscale x 2 x i64> @ldnt1d_i64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1d_i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x i64> %res
}

define <vscale x 2 x double> @ldnt1d_f64(<vscale x 2 x i1> %pred, ptr %addr) {
; CHECK-LABEL: ldnt1d_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldnt1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: ret
  %res = call <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1> %pred, ptr %addr)
  ret <vscale x 2 x double> %res
}

declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1rq.nxv16i8(<vscale x 16 x i1>, ptr)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1rq.nxv8i16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, ptr)
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, ptr)

declare <vscale x 16 x i8> @llvm.aarch64.sve.ldnt1.nxv16i8(<vscale x 16 x i1>, ptr)
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnt1.nxv8i16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, ptr)
declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, ptr)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, ptr)
declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, ptr)
declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, ptr)

declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
declare <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double>, <2 x double>, i64)
declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float>, <4 x float>, i64)
declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v8f16(<vscale x 8 x half>, <8 x half>, i64)
declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)

declare <vscale x 2 x i64> @llvm.aarch64.sve.dupq.lane.nxv2i64(<vscale x 2 x i64>, i64)
declare <vscale x 2 x double> @llvm.aarch64.sve.dupq.lane.nxv2f64(<vscale x 2 x double>, i64)
declare <vscale x 4 x i32> @llvm.aarch64.sve.dupq.lane.nxv4i32(<vscale x 4 x i32>, i64)
declare <vscale x 4 x float> @llvm.aarch64.sve.dupq.lane.nxv4f32(<vscale x 4 x float>, i64)
declare <vscale x 8 x i16> @llvm.aarch64.sve.dupq.lane.nxv8i16(<vscale x 8 x i16>, i64)
declare <vscale x 8 x half> @llvm.aarch64.sve.dupq.lane.nxv8f16(<vscale x 8 x half>, i64)
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.dupq.lane.nxv8bf16(<vscale x 8 x bfloat>, i64)
declare <vscale x 16 x i8> @llvm.aarch64.sve.dupq.lane.nxv16i8(<vscale x 16 x i8>, i64)