; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 -mattr=+fullfp16,+bf16,+sve | FileCheck %s

; Overview of the tests visible in this part of the file:
;
; Each small function loads one scalar from memory and inserts it into lane 0
; of a constant-zero vector. For the fixed-width vector types the expected
; output is a single scalar load into the matching b/h/s/d register, with no
; separate zeroing or lane-insert instruction.
; (Presumably the scalar load is sufficient because writing a b/h/s/d register
; clears the remaining bits of the vector register; that architectural detail
; is not shown in this file - confirm against the Arm ARM.)
;
; The FileCheck assertion lines inside each function are produced by
; utils/update_llc_test_checks.py; regenerate them with that script instead of
; editing them by hand.

; Fixed-width vectors, plain base-register address.
; Covers i8/i16/i32/i64, half, bfloat, float and double elements. The expected
; load register is selected by element size: b0, h0, s0 or d0.

define <8 x i8> @loadv8i8(ptr %p) {
; CHECK-LABEL: loadv8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
  %l = load i8, ptr %p
  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
  ret <8 x i8> %v
}

define <16 x i8> @loadv16i8(ptr %p) {
; CHECK-LABEL: loadv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
  %l = load i8, ptr %p
  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
  ret <16 x i8> %v
}

define <4 x i16> @loadv4i16(ptr %p) {
; CHECK-LABEL: loadv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load i16, ptr %p
  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
  ret <4 x i16> %v
}

define <8 x i16> @loadv8i16(ptr %p) {
; CHECK-LABEL: loadv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load i16, ptr %p
  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
  ret <8 x i16> %v
}

define <2 x i32> @loadv2i32(ptr %p) {
; CHECK-LABEL: loadv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load i32, ptr %p
  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
  ret <2 x i32> %v
}

define <4 x i32> @loadv4i32(ptr %p) {
; CHECK-LABEL: loadv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load i32, ptr %p
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
  ret <4 x i32> %v
}

define <2 x i64> @loadv2i64(ptr %p) {
; CHECK-LABEL: loadv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
  %l = load i64, ptr %p
  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
  ret <2 x i64> %v
}


define <4 x half> @loadv4f16(ptr %p) {
; CHECK-LABEL: loadv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load half, ptr %p
  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
  ret <4 x half> %v
}

define <8 x half> @loadv8f16(ptr %p) {
; CHECK-LABEL: loadv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load half, ptr %p
  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
  ret <8 x half> %v
}

define <4 x bfloat> @loadv4bf16(ptr %p) {
; CHECK-LABEL: loadv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load bfloat, ptr %p
  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <4 x bfloat> %v
}

define <8 x bfloat> @loadv8bf16(ptr %p) {
; CHECK-LABEL: loadv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load bfloat, ptr %p
  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <8 x bfloat> %v
}

define <2 x float> @loadv2f32(ptr %p) {
; CHECK-LABEL: loadv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load float, ptr %p
  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
  ret <2 x float> %v
}

define <4 x float> @loadv4f32(ptr %p) {
; CHECK-LABEL: loadv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load float, ptr %p
  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
  ret <4 x float> %v
}

define <2 x double> @loadv2f64(ptr %p) {
; CHECK-LABEL: loadv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
  %l = load double, ptr %p
  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
  ret <2 x double> %v
}


; Unscaled
;
; Same pattern with a constant byte offset from the base pointer.
; "_offset" functions use +1: the i8 cases are expected to keep `ldr` with an
; immediate, while every wider element type is expected to use `ldur`.
; "_noffset" functions (further down) use -1: all element sizes, including i8,
; are expected to use `ldur`.

define <8 x i8> @loadv8i8_offset(ptr %p) {
; CHECK-LABEL: loadv8i8_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i8, ptr %g
  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
  ret <8 x i8> %v
}

define <16 x i8> @loadv16i8_offset(ptr %p) {
; CHECK-LABEL: loadv16i8_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i8, ptr %g
  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
  ret <16 x i8> %v
}

define <4 x i16> @loadv4i16_offset(ptr %p) {
; CHECK-LABEL: loadv4i16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i16, ptr %g
  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
  ret <4 x i16> %v
}

define <8 x i16> @loadv8i16_offset(ptr %p) {
; CHECK-LABEL: loadv8i16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i16, ptr %g
  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
  ret <8 x i16> %v
}

define <2 x i32> @loadv2i32_offset(ptr %p) {
; CHECK-LABEL: loadv2i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i32, ptr %g
  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
  ret <2 x i32> %v
}

define <4 x i32> @loadv4i32_offset(ptr %p) {
; CHECK-LABEL: loadv4i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i32, ptr %g
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
  ret <4 x i32> %v
}

define <2 x i64> @loadv2i64_offset(ptr %p) {
; CHECK-LABEL: loadv2i64_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur d0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i64, ptr %g
  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
  ret <2 x i64> %v
}


define <4 x half> @loadv4f16_offset(ptr %p) {
; CHECK-LABEL: loadv4f16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load half, ptr %g
  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
  ret <4 x half> %v
}

define <8 x half> @loadv8f16_offset(ptr %p) {
; CHECK-LABEL: loadv8f16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load half, ptr %g
  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
  ret <8 x half> %v
}

define <4 x bfloat> @loadv4bf16_offset(ptr %p) {
; CHECK-LABEL: loadv4bf16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load bfloat, ptr %g
  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <4 x bfloat> %v
}

define <8 x bfloat> @loadv8bf16_offset(ptr %p) {
; CHECK-LABEL: loadv8bf16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load bfloat, ptr %g
  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <8 x bfloat> %v
}

define <2 x float> @loadv2f32_offset(ptr %p) {
; CHECK-LABEL: loadv2f32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load float, ptr %g
  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
  ret <2 x float> %v
}

define <4 x float> @loadv4f32_offset(ptr %p) {
; CHECK-LABEL: loadv4f32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load float, ptr %g
  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
  ret <4 x float> %v
}

define <2 x double> @loadv2f64_offset(ptr %p) {
; CHECK-LABEL: loadv2f64_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur d0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load double, ptr %g
  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
  ret <2 x double> %v
}


; Negative constant byte offset (-1): every case below expects `ldur`.

define <8 x i8> @loadv8i8_noffset(ptr %p) {
; CHECK-LABEL: loadv8i8_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur b0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i8, ptr %g
  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
  ret <8 x i8> %v
}

define <16 x i8> @loadv16i8_noffset(ptr %p) {
; CHECK-LABEL: loadv16i8_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur b0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i8, ptr %g
  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
  ret <16 x i8> %v
}

define <4 x i16> @loadv4i16_noffset(ptr %p) {
; CHECK-LABEL: loadv4i16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i16, ptr %g
  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
  ret <4 x i16> %v
}

define <8 x i16> @loadv8i16_noffset(ptr %p) {
; CHECK-LABEL: loadv8i16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i16, ptr %g
  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
  ret <8 x i16> %v
}

define <2 x i32> @loadv2i32_noffset(ptr %p) {
; CHECK-LABEL: loadv2i32_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i32, ptr %g
  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
  ret <2 x i32> %v
}

define <4 x i32> @loadv4i32_noffset(ptr %p) {
; CHECK-LABEL: loadv4i32_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i32, ptr %g
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
  ret <4 x i32> %v
}

define <2 x i64> @loadv2i64_noffset(ptr %p) {
; CHECK-LABEL: loadv2i64_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur d0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load i64, ptr %g
  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
  ret <2 x i64> %v
}

define <4 x half> @loadv4f16_noffset(ptr %p) {
; CHECK-LABEL: loadv4f16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load half, ptr %g
  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
  ret <4 x half> %v
}

define <8 x half> @loadv8f16_noffset(ptr %p) {
; CHECK-LABEL: loadv8f16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load half, ptr %g
  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
  ret <8 x half> %v
}

define <4 x bfloat> @loadv4bf16_noffset(ptr %p) {
; CHECK-LABEL: loadv4bf16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load bfloat, ptr %g
  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <4 x bfloat> %v
}

define <8 x bfloat> @loadv8bf16_noffset(ptr %p) {
; CHECK-LABEL: loadv8bf16_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load bfloat, ptr %g
  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <8 x bfloat> %v
}

define <2 x float> @loadv2f32_noffset(ptr %p) {
; CHECK-LABEL: loadv2f32_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load float, ptr %g
  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
  ret <2 x float> %v
}

define <4 x float> @loadv4f32_noffset(ptr %p) {
; CHECK-LABEL: loadv4f32_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load float, ptr %g
  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
  ret <4 x float> %v
}

define <2 x double> @loadv2f64_noffset(ptr %p) {
; CHECK-LABEL: loadv2f64_noffset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur d0, [x0, #-1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 -1
  %l = load double, ptr %g
  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
  ret <2 x double> %v
}


; roW addressing modes
;
; The address is base + a 32-bit index (%o is i32), indexed in units of the
; element type. The expected load folds the index as `[x0, w1, sxtw]`, with a
; shift of #1, #2 or #3 for 2-, 4- and 8-byte elements and no shift for i8.

define <8 x i8> @loadv8i8_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv8i8_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, w1, sxtw]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i32 %o
  %l = load i8, ptr %g
  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
  ret <8 x i8> %v
}

define <16 x i8> @loadv16i8_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv16i8_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, w1, sxtw]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i32 %o
  %l = load i8, ptr %g
  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
  ret <16 x i8> %v
}

define <4 x i16> @loadv4i16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv4i16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i16, ptr %p, i32 %o
  %l = load i16, ptr %g
  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
  ret <4 x i16> %v
}

define <8 x i16> @loadv8i16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv8i16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i16, ptr %p, i32 %o
  %l = load i16, ptr %g
  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
  ret <8 x i16> %v
}

define <2 x i32> @loadv2i32_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv2i32_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i32, ptr %p, i32 %o
  %l = load i32, ptr %g
  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
  ret <2 x i32> %v
}

define <4 x i32> @loadv4i32_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv4i32_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i32, ptr %p, i32 %o
  %l = load i32, ptr %g
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
  ret <4 x i32> %v
}

define <2 x i64> @loadv2i64_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv2i64_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i64, ptr %p, i32 %o
  %l = load i64, ptr %g
  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
  ret <2 x i64> %v
}

define <4 x half> @loadv4f16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv4f16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds half, ptr %p, i32 %o
  %l = load half, ptr %g
  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
  ret <4 x half> %v
}

define <8 x half> @loadv8f16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv8f16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds half, ptr %p, i32 %o
  %l = load half, ptr %g
  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
  ret <8 x half> %v
}

define <4 x bfloat> @loadv4bf16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv4bf16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds bfloat, ptr %p, i32 %o
  %l = load bfloat, ptr %g
  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <4 x bfloat> %v
}

define <8 x bfloat> @loadv8bf16_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv8bf16_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, w1, sxtw #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds bfloat, ptr %p, i32 %o
  %l = load bfloat, ptr %g
  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <8 x bfloat> %v
}

define <2 x float> @loadv2f32_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv2f32_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds float, ptr %p, i32 %o
  %l = load float, ptr %g
  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
  ret <2 x float> %v
}

define <4 x float> @loadv4f32_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv4f32_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, w1, sxtw #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds float, ptr %p, i32 %o
  %l = load float, ptr %g
  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
  ret <4 x float> %v
}

define <2 x double> @loadv2f64_roW(ptr %p, i32 %o) {
; CHECK-LABEL: loadv2f64_roW:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, w1, sxtw #3]
; CHECK-NEXT: ret
  %g = getelementptr inbounds double, ptr %p, i32 %o
  %l = load double, ptr %g
  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
  ret <2 x double> %v
}


; roX
;
; Same as the roW group but with a 64-bit index (%o is i64). The expected load
; folds the index as `[x0, x1]`, with `lsl #1`, `#2` or `#3` for 2-, 4- and
; 8-byte elements and no shift for i8.

define <8 x i8> @loadv8i8_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv8i8_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, x1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 %o
  %l = load i8, ptr %g
  %v = insertelement <8 x i8> zeroinitializer, i8 %l, i32 0
  ret <8 x i8> %v
}

define <16 x i8> @loadv16i8_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv16i8_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, x1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 %o
  %l = load i8, ptr %g
  %v = insertelement <16 x i8> zeroinitializer, i8 %l, i32 0
  ret <16 x i8> %v
}

define <4 x i16> @loadv4i16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv4i16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i16, ptr %p, i64 %o
  %l = load i16, ptr %g
  %v = insertelement <4 x i16> zeroinitializer, i16 %l, i32 0
  ret <4 x i16> %v
}

define <8 x i16> @loadv8i16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv8i16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i16, ptr %p, i64 %o
  %l = load i16, ptr %g
  %v = insertelement <8 x i16> zeroinitializer, i16 %l, i32 0
  ret <8 x i16> %v
}

define <2 x i32> @loadv2i32_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv2i32_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i32, ptr %p, i64 %o
  %l = load i32, ptr %g
  %v = insertelement <2 x i32> zeroinitializer, i32 %l, i32 0
  ret <2 x i32> %v
}

define <4 x i32> @loadv4i32_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv4i32_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i32, ptr %p, i64 %o
  %l = load i32, ptr %g
  %v = insertelement <4 x i32> zeroinitializer, i32 %l, i32 0
  ret <4 x i32> %v
}

define <2 x i64> @loadv2i64_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv2i64_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i64, ptr %p, i64 %o
  %l = load i64, ptr %g
  %v = insertelement <2 x i64> zeroinitializer, i64 %l, i32 0
  ret <2 x i64> %v
}

define <4 x half> @loadv4f16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv4f16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds half, ptr %p, i64 %o
  %l = load half, ptr %g
  %v = insertelement <4 x half> zeroinitializer, half %l, i32 0
  ret <4 x half> %v
}

define <8 x half> @loadv8f16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv8f16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds half, ptr %p, i64 %o
  %l = load half, ptr %g
  %v = insertelement <8 x half> zeroinitializer, half %l, i32 0
  ret <8 x half> %v
}

define <4 x bfloat> @loadv4bf16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv4bf16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds bfloat, ptr %p, i64 %o
  %l = load bfloat, ptr %g
  %v = insertelement <4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <4 x bfloat> %v
}

define <8 x bfloat> @loadv8bf16_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv8bf16_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds bfloat, ptr %p, i64 %o
  %l = load bfloat, ptr %g
  %v = insertelement <8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <8 x bfloat> %v
}

define <2 x float> @loadv2f32_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv2f32_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds float, ptr %p, i64 %o
  %l = load float, ptr %g
  %v = insertelement <2 x float> zeroinitializer, float %l, i32 0
  ret <2 x float> %v
}

define <4 x float> @loadv4f32_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv4f32_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0, x1, lsl #2]
; CHECK-NEXT: ret
  %g = getelementptr inbounds float, ptr %p, i64 %o
  %l = load float, ptr %g
  %v = insertelement <4 x float> zeroinitializer, float %l, i32 0
  ret <4 x float> %v
}

define <2 x double> @loadv2f64_roX(ptr %p, i64 %o) {
; CHECK-LABEL: loadv2f64_roX:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0, x1, lsl #3]
; CHECK-NEXT: ret
  %g = getelementptr inbounds double, ptr %p, i64 %o
  %l = load double, ptr %g
  %v = insertelement <2 x double> zeroinitializer, double %l, i32 0
  ret <2 x double> %v
}


; Larger example built from the same pattern.
; Three overlapping i32 loads from %2 at byte offsets 0, 1 and 2 are each
; inserted into lane 0 of <i32 poison, i32 0> and reinterpreted as <8 x i8>.
; The results feed urhadd / widening-add / rshrn arithmetic, and four i32 rows
; are stored to %0 at byte offsets 0, %1, 2 * %1 and 3 * %1.
; The offset-0 and offset-1 loads are expected to become `ldr s1` / `ldur s2`.
; The offset-2 load (%13) also feeds the scalar `lshr`, and the expected
; output keeps it as a GPR load (`ldur w8`) followed by `mov v0.s[0], w8` into
; a `movi`-zeroed register - presumably because of that extra scalar use.
; NOTE(review): the loads at byte offsets 1 and 2 are annotated `align 4`;
; nothing visible here establishes the alignment of %2 - confirm it is intended.
; NOTE(review): %18 and the shuffle masks for %38/%43 still use `undef`;
; consider `poison` when this test is next regenerated.
define void @predictor_4x4_neon(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
; CHECK-LABEL: predictor_4x4_neon:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v0.2d, #0000000000000000
; CHECK-NEXT: ldur w8, [x2, #2]
; CHECK-NEXT: ldr s1, [x2]
; CHECK-NEXT: ldur s2, [x2, #1]
; CHECK-NEXT: ushll v3.8h, v2.8b, #1
; CHECK-NEXT: mov v0.s[0], w8
; CHECK-NEXT: lsr w8, w8, #24
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
; CHECK-NEXT: add v0.8h, v0.8h, v3.8h
; CHECK-NEXT: dup v3.8b, w8
; CHECK-NEXT: str s1, [x0]
; CHECK-NEXT: lsl x8, x1, #1
; CHECK-NEXT: rshrn v0.8b, v0.8h, #2
; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s
; CHECK-NEXT: str s0, [x0, x1]
; CHECK-NEXT: zip1 v3.2s, v0.2s, v3.2s
; CHECK-NEXT: ext v2.8b, v2.8b, v0.8b, #1
; CHECK-NEXT: ext v1.8b, v3.8b, v0.8b, #1
; CHECK-NEXT: str s2, [x0, x8]
; CHECK-NEXT: add x8, x8, x1
; CHECK-NEXT: str s1, [x0, x8]
; CHECK-NEXT: ret
  %5 = load i32, ptr %2, align 4
  %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
  %7 = bitcast <2 x i32> %6 to <8 x i8>
  %8 = getelementptr inbounds i8, ptr %2, i64 1
  %9 = load i32, ptr %8, align 4
  %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
  %11 = bitcast <2 x i32> %10 to <8 x i8>
  %12 = getelementptr inbounds i8, ptr %2, i64 2
  %13 = load i32, ptr %12, align 4
  %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
  %15 = bitcast <2 x i32> %14 to <8 x i8>
  %16 = lshr i32 %13, 24
  %17 = trunc i32 %16 to i8
  %18 = insertelement <8 x i8> undef, i8 %17, i64 0
  %19 = shufflevector <8 x i8> %18, <8 x i8> poison, <8 x i32> zeroinitializer
  %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
  %21 = zext <8 x i8> %7 to <8 x i16>
  %22 = zext <8 x i8> %11 to <8 x i16>
  %23 = zext <8 x i8> %15 to <8 x i16>
  %24 = shl nuw nsw <8 x i16> %22, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
  %25 = add nuw nsw <8 x i16> %23, %21
  %26 = add nuw nsw <8 x i16> %25, %24
  %27 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %26, i32 2)
  %28 = bitcast <8 x i8> %20 to <2 x i32>
  %29 = extractelement <2 x i32> %28, i64 0
  store i32 %29, ptr %0, align 4
  %30 = bitcast <8 x i8> %27 to <2 x i32>
  %31 = getelementptr inbounds i8, ptr %0, i64 %1
  %32 = extractelement <2 x i32> %30, i64 0
  store i32 %32, ptr %31, align 4
  %33 = bitcast <8 x i8> %19 to <2 x i32>
  %34 = shufflevector <2 x i32> %28, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
  %35 = bitcast <2 x i32> %34 to <8 x i8>
  %36 = shufflevector <2 x i32> %30, <2 x i32> %33, <2 x i32> <i32 0, i32 2>
  %37 = bitcast <2 x i32> %36 to <8 x i8>
  %38 = shufflevector <8 x i8> %35, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
  %39 = bitcast <8 x i8> %38 to <2 x i32>
  %40 = shl nsw i64 %1, 1
  %41 = getelementptr inbounds i8, ptr %0, i64 %40
  %42 = extractelement <2 x i32> %39, i64 0
  store i32 %42, ptr %41, align 4
  %43 = shufflevector <8 x i8> %37, <8 x i8> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 undef, i32 undef>
  %44 = bitcast <8 x i8> %43 to <2 x i32>
  %45 = mul nsw i64 %1, 3
  %46 = getelementptr inbounds i8, ptr %0, i64 %45
  %47 = extractelement <2 x i32> %44, i64 0
  store i32 %47, ptr %46, align 4
  ret void
}

; Variant of the function above with four overlapping i32 loads from %2 at
; byte offsets 0, 1, 2 and 3. Each loaded value is used only by its
; insertelement, and all four are expected to become direct loads into
; s0..s3 (`ldr` for offset 0, `ldur` for the others).
; NOTE(review): the loads at byte offsets 1..3 are annotated `align 4`;
; nothing visible here establishes the alignment of %2 - confirm it is intended.
define void @predictor_4x4_neon_new(ptr nocapture noundef writeonly %0, i64 noundef %1, ptr nocapture noundef readonly %2, ptr nocapture noundef readnone %3) {
; CHECK-LABEL: predictor_4x4_neon_new:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x2]
; CHECK-NEXT: ldur s1, [x2, #1]
; CHECK-NEXT: lsl x8, x1, #1
; CHECK-NEXT: ldur s2, [x2, #2]
; CHECK-NEXT: ldur s3, [x2, #3]
; CHECK-NEXT: uaddl v4.8h, v1.8b, v0.8b
; CHECK-NEXT: urhadd v0.8b, v0.8b, v1.8b
; CHECK-NEXT: add x9, x8, x1
; CHECK-NEXT: uaddl v5.8h, v2.8b, v1.8b
; CHECK-NEXT: uaddl v3.8h, v3.8b, v2.8b
; CHECK-NEXT: urhadd v1.8b, v1.8b, v2.8b
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: add v4.8h, v4.8h, v5.8h
; CHECK-NEXT: add v3.8h, v3.8h, v5.8h
; CHECK-NEXT: rshrn v4.8b, v4.8h, #2
; CHECK-NEXT: rshrn v0.8b, v3.8h, #2
; CHECK-NEXT: str s4, [x0, x1]
; CHECK-NEXT: str s1, [x0, x8]
; CHECK-NEXT: str s0, [x0, x9]
; CHECK-NEXT: ret
  %5 = load i32, ptr %2, align 4
  %6 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %5, i64 0
  %7 = bitcast <2 x i32> %6 to <8 x i8>
  %8 = getelementptr inbounds i8, ptr %2, i64 1
  %9 = load i32, ptr %8, align 4
  %10 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %9, i64 0
  %11 = bitcast <2 x i32> %10 to <8 x i8>
  %12 = getelementptr inbounds i8, ptr %2, i64 2
  %13 = load i32, ptr %12, align 4
  %14 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %13, i64 0
  %15 = bitcast <2 x i32> %14 to <8 x i8>
  %16 = getelementptr inbounds i8, ptr %2, i64 3
  %17 = load i32, ptr %16, align 4
  %18 = insertelement <2 x i32> <i32 poison, i32 0>, i32 %17, i64 0
  %19 = bitcast <2 x i32> %18 to <8 x i8>
  %20 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %7, <8 x i8> %11)
  %21 = tail call <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8> %11, <8 x i8> %15)
  %22 = zext <8 x i8> %7 to <8 x i16>
  %23 = zext <8 x i8> %11 to <8 x i16>
  %24 = add nuw nsw <8 x i16> %23, %22
  %25 = zext <8 x i8> %15 to <8 x i16>
  %26 = add nuw nsw <8 x i16> %25, %23
  %27 = add nuw nsw <8 x i16> %24, %26
  %28 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %27, i32 2)
  %29 = zext <8 x i8> %19 to <8 x i16>
  %30 = add nuw nsw <8 x i16> %29, %25
  %31 = add nuw nsw <8 x i16> %30, %26
  %32 = tail call <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16> %31, i32 2)
  %33 = bitcast <8 x i8> %20 to <2 x i32>
  %34 = extractelement <2 x i32> %33, i64 0
  store i32 %34, ptr %0, align 4
  %35 = bitcast <8 x i8> %28 to <2 x i32>
  %36 = getelementptr inbounds i8, ptr %0, i64 %1
  %37 = extractelement <2 x i32> %35, i64 0
  store i32 %37, ptr %36, align 4
  %38 = bitcast <8 x i8> %21 to <2 x i32>
  %39 = shl nsw i64 %1, 1
  %40 = getelementptr inbounds i8, ptr %0, i64 %39
  %41 = extractelement <2 x i32> %38, i64 0
  store i32 %41, ptr %40, align 4
  %42 = bitcast <8 x i8> %32 to <2 x i32>
  %43 = mul nsw i64 %1, 3
  %44 = getelementptr inbounds i8, ptr %0, i64 %43
  %45 = extractelement <2 x i32> %42, i64 0
  store i32 %45, ptr %44, align 4
  ret void
}


; Scalable (`<vscale x N x T>`) vector types, plain base-register address.
; The expected output currently differs by type:
;  - nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv8f16, nxv8bf16, nxv4f32 and nxv2f64
;    expect the same single scalar load as the fixed-width cases.
;  - nxv8i8, nxv4i16 and nxv2i32 (elements held in wider lanes, e.g. z0.h for
;    nxv8i8) expect a GPR load plus a `ptrue ..., vl1` predicated `mov` into a
;    zeroed vector.
;  - nxv4f16, nxv4bf16 and nxv2f32 expect a predicate built with
;    `index` + `cmpeq`, an FP-register load and a predicated `mov`.

define <vscale x 8 x i8> @loadnxv8i8(ptr %p) {
; CHECK-LABEL: loadnxv8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: ptrue p0.h, vl1
; CHECK-NEXT: mov z0.h, p0/m, w8
; CHECK-NEXT: ret
  %l = load i8, ptr %p
  %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
  ret <vscale x 8 x i8> %v
}

define <vscale x 16 x i8> @loadnxv16i8(ptr %p) {
; CHECK-LABEL: loadnxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: ret
  %l = load i8, ptr %p
  %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
  ret <vscale x 16 x i8> %v
}

define <vscale x 4 x i16> @loadnxv4i16(ptr %p) {
; CHECK-LABEL: loadnxv4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: ldrh w8, [x0]
; CHECK-NEXT: ptrue p0.s, vl1
; CHECK-NEXT: mov z0.s, p0/m, w8
; CHECK-NEXT: ret
  %l = load i16, ptr %p
  %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
  ret <vscale x 4 x i16> %v
}

define <vscale x 8 x i16> @loadnxv8i16(ptr %p) {
; CHECK-LABEL: loadnxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load i16, ptr %p
  %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
  ret <vscale x 8 x i16> %v
}

define <vscale x 2 x i32> @loadnxv2i32(ptr %p) {
; CHECK-LABEL: loadnxv2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, #0 // =0x0
; CHECK-NEXT: ldr w8, [x0]
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: mov z0.d, p0/m, x8
; CHECK-NEXT: ret
  %l = load i32, ptr %p
  %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
  ret <vscale x 2 x i32> %v
}

define <vscale x 4 x i32> @loadnxv4i32(ptr %p) {
; CHECK-LABEL: loadnxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load i32, ptr %p
  %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
  ret <vscale x 4 x i32> %v
}

define <vscale x 2 x i64> @loadnxv2i64(ptr %p) {
; CHECK-LABEL: loadnxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
  %l = load i64, ptr %p
  %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0
  ret <vscale x 2 x i64> %v
}


define <vscale x 4 x half> @loadnxv4f16(ptr %p) {
; CHECK-LABEL: loadnxv4f16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: ldr h1, [x0]
; CHECK-NEXT: mov z0.h, p0/m, h1
; CHECK-NEXT: ret
  %l = load half, ptr %p
  %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0
  ret <vscale x 4 x half> %v
}

define <vscale x 8 x half> @loadnxv8f16(ptr %p) {
; CHECK-LABEL: loadnxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load half, ptr %p
  %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0
  ret <vscale x 8 x half> %v
}

define <vscale x 4 x bfloat> @loadnxv4bf16(ptr %p) {
; CHECK-LABEL: loadnxv4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, wzr
; CHECK-NEXT: index z0.s, #0, #1
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: mov z1.s, w8
; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: ldr h1, [x0]
; CHECK-NEXT: mov z0.h, p0/m, h1
; CHECK-NEXT: ret
  %l = load bfloat, ptr %p
  %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <vscale x 4 x bfloat> %v
}

define <vscale x 8 x bfloat> @loadnxv8bf16(ptr %p) {
; CHECK-LABEL: loadnxv8bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: ret
  %l = load bfloat, ptr %p
  %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0
  ret <vscale x 8 x bfloat> %v
}

define <vscale x 2 x float> @loadnxv2f32(ptr %p) {
; CHECK-LABEL: loadnxv2f32:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: index z0.d, #0, #1
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov z1.d, x8
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: ldr s1, [x0]
; CHECK-NEXT: mov z0.s, p0/m, s1
; CHECK-NEXT: ret
  %l = load float, ptr %p
  %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0
  ret <vscale x 2 x float> %v
}

define <vscale x 4 x float> @loadnxv4f32(ptr %p) {
; CHECK-LABEL: loadnxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr s0, [x0]
; CHECK-NEXT: ret
  %l = load float, ptr %p
  %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0
  ret <vscale x 4 x float> %v
}

define <vscale x 2 x double> @loadnxv2f64(ptr %p) {
; CHECK-LABEL: loadnxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr d0, [x0]
; CHECK-NEXT: ret
  %l = load double, ptr %p
  %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0
  ret <vscale x 2 x double> %v
}


; Unscaled
;
; Scalable vector types with a constant byte offset of +1 from the base
; pointer. The split between single-load and multi-instruction expectations
; follows the same per-type pattern as the scalable base-address group above.

define <vscale x 8 x i8> @loadnxv8i8_offset(ptr %p) {
; CHECK-LABEL: loadnxv8i8_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.h, #0 // =0x0
; CHECK-NEXT: ldrb w8, [x0, #1]
; CHECK-NEXT: ptrue p0.h, vl1
; CHECK-NEXT: mov z0.h, p0/m, w8
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i8, ptr %g
  %v = insertelement <vscale x 8 x i8> zeroinitializer, i8 %l, i32 0
  ret <vscale x 8 x i8> %v
}

define <vscale x 16 x i8> @loadnxv16i8_offset(ptr %p) {
; CHECK-LABEL: loadnxv16i8_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i8, ptr %g
  %v = insertelement <vscale x 16 x i8> zeroinitializer, i8 %l, i32 0
  ret <vscale x 16 x i8> %v
}

define <vscale x 4 x i16> @loadnxv4i16_offset(ptr %p) {
; CHECK-LABEL: loadnxv4i16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.s, #0 // =0x0
; CHECK-NEXT: ldurh w8, [x0, #1]
; CHECK-NEXT: ptrue p0.s, vl1
; CHECK-NEXT: mov z0.s, p0/m, w8
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i16, ptr %g
  %v = insertelement <vscale x 4 x i16> zeroinitializer, i16 %l, i32 0
  ret <vscale x 4 x i16> %v
}

define <vscale x 8 x i16> @loadnxv8i16_offset(ptr %p) {
; CHECK-LABEL: loadnxv8i16_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i16, ptr %g
  %v = insertelement <vscale x 8 x i16> zeroinitializer, i16 %l, i32 0
  ret <vscale x 8 x i16> %v
}

define <vscale x 2 x i32> @loadnxv2i32_offset(ptr %p) {
; CHECK-LABEL: loadnxv2i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z0.d, #0 // =0x0
; CHECK-NEXT: ldur w8, [x0, #1]
; CHECK-NEXT: ptrue p0.d, vl1
; CHECK-NEXT: mov z0.d, p0/m, x8
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i32, ptr %g
  %v = insertelement <vscale x 2 x i32> zeroinitializer, i32 %l, i32 0
  ret <vscale x 2 x i32> %v
}

define <vscale x 4 x i32> @loadnxv4i32_offset(ptr %p) {
; CHECK-LABEL: loadnxv4i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
  %g = getelementptr inbounds i8, ptr %p, i64 1
  %l = load i32, ptr %g
  %v = insertelement <vscale x 4 x i32> zeroinitializer, i32 %l, i32 0
  ret <vscale x 4 x i32> %v
}

define <vscale x 2 x i64> @loadnxv2i64_offset(ptr %p) {
; CHECK-LABEL: loadnxv2i64_offset:
; CHECK: // %bb.0:
; 
CHECK-NEXT: ldur d0, [x0, #1] 1174; CHECK-NEXT: ret 1175 %g = getelementptr inbounds i8, ptr %p, i64 1 1176 %l = load i64, ptr %g 1177 %v = insertelement <vscale x 2 x i64> zeroinitializer, i64 %l, i32 0 1178 ret <vscale x 2 x i64> %v 1179} 1180 1181 1182define <vscale x 4 x half> @loadnxv4f16_offset(ptr %p) { 1183; CHECK-LABEL: loadnxv4f16_offset: 1184; CHECK: // %bb.0: 1185; CHECK-NEXT: mov w8, wzr 1186; CHECK-NEXT: index z0.s, #0, #1 1187; CHECK-NEXT: ptrue p0.s 1188; CHECK-NEXT: mov z1.s, w8 1189; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s 1190; CHECK-NEXT: mov z0.h, #0 // =0x0 1191; CHECK-NEXT: ldur h1, [x0, #1] 1192; CHECK-NEXT: mov z0.h, p0/m, h1 1193; CHECK-NEXT: ret 1194 %g = getelementptr inbounds i8, ptr %p, i64 1 1195 %l = load half, ptr %g 1196 %v = insertelement <vscale x 4 x half> zeroinitializer, half %l, i32 0 1197 ret <vscale x 4 x half> %v 1198} 1199 1200define <vscale x 8 x half> @loadnxv8f16_offset(ptr %p) { 1201; CHECK-LABEL: loadnxv8f16_offset: 1202; CHECK: // %bb.0: 1203; CHECK-NEXT: ldur h0, [x0, #1] 1204; CHECK-NEXT: ret 1205 %g = getelementptr inbounds i8, ptr %p, i64 1 1206 %l = load half, ptr %g 1207 %v = insertelement <vscale x 8 x half> zeroinitializer, half %l, i32 0 1208 ret <vscale x 8 x half> %v 1209} 1210 1211define <vscale x 4 x bfloat> @loadnxv4bf16_offset(ptr %p) { 1212; CHECK-LABEL: loadnxv4bf16_offset: 1213; CHECK: // %bb.0: 1214; CHECK-NEXT: mov w8, wzr 1215; CHECK-NEXT: index z0.s, #0, #1 1216; CHECK-NEXT: ptrue p0.s 1217; CHECK-NEXT: mov z1.s, w8 1218; CHECK-NEXT: cmpeq p0.s, p0/z, z0.s, z1.s 1219; CHECK-NEXT: mov z0.h, #0 // =0x0 1220; CHECK-NEXT: ldur h1, [x0, #1] 1221; CHECK-NEXT: mov z0.h, p0/m, h1 1222; CHECK-NEXT: ret 1223 %g = getelementptr inbounds i8, ptr %p, i64 1 1224 %l = load bfloat, ptr %g 1225 %v = insertelement <vscale x 4 x bfloat> zeroinitializer, bfloat %l, i32 0 1226 ret <vscale x 4 x bfloat> %v 1227} 1228 1229define <vscale x 8 x bfloat> @loadnxv8bf16_offset(ptr %p) { 1230; CHECK-LABEL: 
loadnxv8bf16_offset: 1231; CHECK: // %bb.0: 1232; CHECK-NEXT: ldur h0, [x0, #1] 1233; CHECK-NEXT: ret 1234 %g = getelementptr inbounds i8, ptr %p, i64 1 1235 %l = load bfloat, ptr %g 1236 %v = insertelement <vscale x 8 x bfloat> zeroinitializer, bfloat %l, i32 0 1237 ret <vscale x 8 x bfloat> %v 1238} 1239 1240define <vscale x 2 x float> @loadnxv2f32_offset(ptr %p) { 1241; CHECK-LABEL: loadnxv2f32_offset: 1242; CHECK: // %bb.0: 1243; CHECK-NEXT: mov x8, xzr 1244; CHECK-NEXT: index z0.d, #0, #1 1245; CHECK-NEXT: ptrue p0.d 1246; CHECK-NEXT: mov z1.d, x8 1247; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, z1.d 1248; CHECK-NEXT: mov z0.s, #0 // =0x0 1249; CHECK-NEXT: ldur s1, [x0, #1] 1250; CHECK-NEXT: mov z0.s, p0/m, s1 1251; CHECK-NEXT: ret 1252 %g = getelementptr inbounds i8, ptr %p, i64 1 1253 %l = load float, ptr %g 1254 %v = insertelement <vscale x 2 x float> zeroinitializer, float %l, i32 0 1255 ret <vscale x 2 x float> %v 1256} 1257 1258define <vscale x 4 x float> @loadnxv4f32_offset(ptr %p) { 1259; CHECK-LABEL: loadnxv4f32_offset: 1260; CHECK: // %bb.0: 1261; CHECK-NEXT: ldur s0, [x0, #1] 1262; CHECK-NEXT: ret 1263 %g = getelementptr inbounds i8, ptr %p, i64 1 1264 %l = load float, ptr %g 1265 %v = insertelement <vscale x 4 x float> zeroinitializer, float %l, i32 0 1266 ret <vscale x 4 x float> %v 1267} 1268 1269define <vscale x 2 x double> @loadnxv2f64_offset(ptr %p) { 1270; CHECK-LABEL: loadnxv2f64_offset: 1271; CHECK: // %bb.0: 1272; CHECK-NEXT: ldur d0, [x0, #1] 1273; CHECK-NEXT: ret 1274 %g = getelementptr inbounds i8, ptr %p, i64 1 1275 %l = load double, ptr %g 1276 %v = insertelement <vscale x 2 x double> zeroinitializer, double %l, i32 0 1277 ret <vscale x 2 x double> %v 1278} 1279 1280 1281declare <8 x i8> @llvm.aarch64.neon.rshrn.v8i8(<8 x i16>, i32) #1 1282declare <8 x i8> @llvm.aarch64.neon.urhadd.v8i8(<8 x i8>, <8 x i8>) #1 1283