; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc < %s -mtriple=aarch64 -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI

define i32 @test_rev_w(i32 %a) nounwind {
; CHECK-LABEL: test_rev_w:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev w0, w0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.bswap.i32(i32 %a)
  ret i32 %0
}

define i64 @test_rev_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x0, x0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  ret i64 %0
}

; Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high 16 bits
; of %a are zero. This optimizes rev + lsr 16 to rev16.
define i32 @test_rev_w_srl16(i16 %a) {
; CHECK-SD-LABEL: test_rev_w_srl16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    rev w8, w0
; CHECK-SD-NEXT:    lsr w0, w8, #16
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev_w_srl16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w0, #0xffff
; CHECK-GI-NEXT:    rev w8, w8
; CHECK-GI-NEXT:    lsr w0, w8, #16
; CHECK-GI-NEXT:    ret
entry:
  %0 = zext i16 %a to i32
  %1 = tail call i32 @llvm.bswap.i32(i32 %0)
  %2 = lshr i32 %1, 16
  ret i32 %2
}

define i32 @test_rev_w_srl16_load(ptr %a) {
; CHECK-LABEL: test_rev_w_srl16_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldrh w8, [x0]
; CHECK-NEXT:    rev w8, w8
; CHECK-NEXT:    lsr w0, w8, #16
; CHECK-NEXT:    ret
entry:
  %0 = load i16, ptr %a
  %1 = zext i16 %0 to i32
  %2 = tail call i32 @llvm.bswap.i32(i32 %1)
  %3 = lshr i32 %2, 16
  ret i32 %3
}

define i32 @test_rev_w_srl16_add(i8 %a, i8 %b) {
; CHECK-SD-LABEL: test_rev_w_srl16_add:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    and w8, w0, #0xff
; CHECK-SD-NEXT:    add w8, w8, w1, uxtb
; CHECK-SD-NEXT:    rev16 w0, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev_w_srl16_add:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    and w8, w1, #0xff
; CHECK-GI-NEXT:    add w8, w8, w0, uxtb
; CHECK-GI-NEXT:    rev w8, w8
; CHECK-GI-NEXT:    lsr w0, w8, #16
; CHECK-GI-NEXT:    ret
entry:
  %0 = zext i8 %a to i32
  %1 = zext i8 %b to i32
  %2 = add i32 %0, %1
  %3 = tail call i32 @llvm.bswap.i32(i32 %2)
  %4 = lshr i32 %3, 16
  ret i32 %4
}

; Canonicalize (srl (bswap x), 32) to (rotr (bswap x), 32) if the high 32 bits
; of %a are zero. This optimizes rev + lsr 32 to rev32.
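; Illustrative example (values not checked by this test): for %a = 0x11223344,
; the zext gives 0x0000000011223344, the i64 bswap gives 0x4433221100000000,
; and lsr #32 gives 0x0000000044332211, the same result as rev32 applied to
; the zero-extended value.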
define i64 @test_rev_x_srl32(i32 %a) {
; CHECK-SD-LABEL: test_rev_x_srl32:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT:    rev x8, x0
; CHECK-SD-NEXT:    lsr x0, x8, #32
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev_x_srl32:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    mov w8, w0
; CHECK-GI-NEXT:    rev x8, x8
; CHECK-GI-NEXT:    lsr x0, x8, #32
; CHECK-GI-NEXT:    ret
entry:
  %0 = zext i32 %a to i64
  %1 = tail call i64 @llvm.bswap.i64(i64 %0)
  %2 = lshr i64 %1, 32
  ret i64 %2
}

define i64 @test_rev_x_srl32_load(ptr %a) {
; CHECK-LABEL: test_rev_x_srl32_load:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr w8, [x0]
; CHECK-NEXT:    rev x8, x8
; CHECK-NEXT:    lsr x0, x8, #32
; CHECK-NEXT:    ret
entry:
  %0 = load i32, ptr %a
  %1 = zext i32 %0 to i64
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

define i64 @test_rev_x_srl32_shift(i64 %a) {
; CHECK-SD-LABEL: test_rev_x_srl32_shift:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ubfx x8, x0, #2, #29
; CHECK-SD-NEXT:    rev32 x0, x8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev_x_srl32_shift:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ubfx x8, x0, #2, #29
; CHECK-GI-NEXT:    rev x8, x8
; CHECK-GI-NEXT:    lsr x0, x8, #32
; CHECK-GI-NEXT:    ret
entry:
  %0 = shl i64 %a, 33
  %1 = lshr i64 %0, 35
  %2 = tail call i64 @llvm.bswap.i64(i64 %1)
  %3 = lshr i64 %2, 32
  ret i64 %3
}

declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone

define i32 @test_rev16_w(i32 %X) nounwind {
; CHECK-SD-LABEL: test_rev16_w:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    rev16 w0, w0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev16_w:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    lsr w8, w0, #8
; CHECK-GI-NEXT:    lsl w9, w0, #8
; CHECK-GI-NEXT:    and w10, w8, #0xff0000
; CHECK-GI-NEXT:    and w11, w9, #0xff000000
; CHECK-GI-NEXT:    and w8, w8, #0xff
; CHECK-GI-NEXT:    and w9, w9, #0xff00
; CHECK-GI-NEXT:    orr w10, w11, w10
; CHECK-GI-NEXT:    orr w8, w9, w8
; CHECK-GI-NEXT:    orr w0, w10, w8
; CHECK-GI-NEXT:    ret
entry:
  %tmp1 = lshr i32 %X, 8
  %X15 = bitcast i32 %X to i32
  %tmp4 = shl i32 %X15, 8
  %tmp2 = and i32 %tmp1, 16711680
  %tmp5 = and i32 %tmp4, -16777216
  %tmp9 = and i32 %tmp1, 255
  %tmp13 = and i32 %tmp4, 65280
  %tmp6 = or i32 %tmp5, %tmp2
  %tmp10 = or i32 %tmp6, %tmp13
  %tmp14 = or i32 %tmp10, %tmp9
  ret i32 %tmp14
}

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;  01234567 ->(bswap) 76543210 ->(rotr) 10765432
;  01234567 ->(rev16) 10325476
define i64 @test_rev16_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev x8, x0
; CHECK-NEXT:    ror x0, x8, #16
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 16
  %2 = shl i64 %0, 48
  %3 = or i64 %1, %2
  ret i64 %3
}

define i64 @test_rev32_x(i64 %a) nounwind {
; CHECK-LABEL: test_rev32_x:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev32 x0, x0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.bswap.i64(i64 %a)
  %1 = lshr i64 %0, 32
  %2 = shl i64 %0, 32
  %3 = or i64 %1, %2
  ret i64 %3
}

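; In the vector tests below, rev64/rev32/rev16 reverse the order of the
; elements within each 64-, 32- or 16-bit container, and the arrangement
; suffix (.8b, .4h, ...) gives the element width. Each shufflevector mask is
; the element reversal that should be matched to a single rev instruction.
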
define <8 x i8> @test_vrev64D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev64D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i16> %tmp2
}

define <2 x i32> @test_vrev64D32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x i32>, ptr %A
  %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i32> %tmp2
}

define <2 x float> @test_vrev64Df(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Df:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.2s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <2 x float>, ptr %A
  %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x float> %tmp2
}

define <16 x i8> @test_vrev64Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev64Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i16> %tmp2
}

define <4 x i32> @test_vrev64Q32(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Q32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i32>, ptr %A
  %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i32> %tmp2
}

define <4 x float> @test_vrev64Qf(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64Qf:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev64.4s v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x float>, ptr %A
  %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x float> %tmp2
}

define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
  ret <8 x i8> %tmp2
}

define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32D16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev32.4h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <4 x i16>, ptr %A
  %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  ret <4 x i16> %tmp2
}

define <16 x i8> @test_vrev32Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>
  ret <16 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i16> %tmp2
}

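; rev16 swaps the two bytes within each 16-bit container (b0 b1 b2 b3 ... ->
; b1 b0 b3 b2 ...), so the vector form only exists for byte elements.
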
define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16D8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev16.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  ret <8 x i8> %tmp2
}

define <16 x i8> @test_vrev16Q8(ptr %A) nounwind {
; CHECK-LABEL: test_vrev16Q8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev16.16b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <16 x i8>, ptr %A
  %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  ret <16 x i8> %tmp2
}

; Undef shuffle indices should not prevent matching to VREV:

define <8 x i8> @test_vrev64D8_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev64D8_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    rev64.8b v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i8>, ptr %A
  %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 7, i32 undef, i32 undef, i32 4, i32 3, i32 2, i32 1, i32 0>
  ret <8 x i8> %tmp2
}

define <8 x i16> @test_vrev32Q16_undef(ptr %A) nounwind {
; CHECK-LABEL: test_vrev32Q16_undef:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    rev32.8h v0, v0
; CHECK-NEXT:    ret
  %tmp1 = load <8 x i16>, ptr %A
  %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 5, i32 4, i32 7, i32 undef>
  ret <8 x i16> %tmp2
}

; vrev <4 x i16> should use REV32 and not REV64
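; (Only a 32-bit pair of halfwords is live in the store, so reversing within
; a 32-bit container is sufficient; the lowering checked below happens to use
; st1.h lane stores instead of a rev.)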
define void @test_vrev64(ptr nocapture %source, ptr nocapture %dst) nounwind ssp {
; CHECK-SD-LABEL: test_vrev64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    add x8, x1, #2
; CHECK-SD-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-SD-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_vrev64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    add x8, x1, #2
; CHECK-GI-NEXT:    st1.h { v0 }[6], [x1]
; CHECK-GI-NEXT:    st1.h { v0 }[5], [x8]
; CHECK-GI-NEXT:    ret
entry:
  %tmp2 = load <8 x i16>, ptr %source, align 4
  %tmp3 = extractelement <8 x i16> %tmp2, i32 6
  %tmp5 = insertelement <2 x i16> undef, i16 %tmp3, i32 0
  %tmp9 = extractelement <8 x i16> %tmp2, i32 5
  %tmp11 = insertelement <2 x i16> %tmp5, i16 %tmp9, i32 1
  store <2 x i16> %tmp11, ptr %dst, align 4
  ret void
}

; Test vrev of float4
define void @float_vrev64(ptr nocapture %source, ptr nocapture %dest) nounwind noinline ssp {
; CHECK-SD-LABEL: float_vrev64:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi.2d v0, #0000000000000000
; CHECK-SD-NEXT:    add x8, x0, #12
; CHECK-SD-NEXT:    dup.4s v0, v0[0]
; CHECK-SD-NEXT:    ld1.s { v0 }[1], [x8]
; CHECK-SD-NEXT:    str q0, [x1, #176]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: float_vrev64:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d0, #0000000000000000
; CHECK-GI-NEXT:    adrp x8, .LCPI28_0
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI28_0]
; CHECK-GI-NEXT:    tbl.16b v0, { v0, v1 }, v2
; CHECK-GI-NEXT:    str q0, [x1, #176]
; CHECK-GI-NEXT:    ret
entry:
  %tmp2 = load <4 x float>, ptr %source, align 4
  %tmp5 = shufflevector <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x float> %tmp2, <4 x i32> <i32 0, i32 7, i32 0, i32 0>
  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %dest, i32 11
  store <4 x float> %tmp5, ptr %arrayidx8, align 4
  ret void
}


define <4 x i32> @test_vrev32_bswap(<4 x i32> %source) nounwind {
; CHECK-LABEL: test_vrev32_bswap:
; CHECK:       // %bb.0:
; CHECK-NEXT:    rev32.16b v0, v0
; CHECK-NEXT:    ret
  %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %source)
  ret <4 x i32> %bswap
}

declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) nounwind readnone

; Reduced regression from D114354
define void @test_rev16_truncstore() {
; CHECK-SD-LABEL: test_rev16_truncstore:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    cbnz wzr, .LBB30_2
; CHECK-SD-NEXT:  .LBB30_1: // %cleanup
; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT:    ldrh w8, [x8]
; CHECK-SD-NEXT:    rev16 w8, w8
; CHECK-SD-NEXT:    strh w8, [x8]
; CHECK-SD-NEXT:    cbz wzr, .LBB30_1
; CHECK-SD-NEXT:  .LBB30_2: // %fail
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev16_truncstore:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    tbnz wzr, #0, .LBB30_2
; CHECK-GI-NEXT:  .LBB30_1: // %cleanup
; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT:    ldrh w8, [x8]
; CHECK-GI-NEXT:    rev w8, w8
; CHECK-GI-NEXT:    lsr w8, w8, #16
; CHECK-GI-NEXT:    strh w8, [x8]
; CHECK-GI-NEXT:    tbz wzr, #0, .LBB30_1
; CHECK-GI-NEXT:  .LBB30_2: // %fail
; CHECK-GI-NEXT:    ret
entry:
  br label %body

body:
  %out.6269.i = phi ptr [ undef, %cleanup ], [ undef, %entry ]
  %0 = load i16, ptr undef, align 2
  %1 = icmp eq i16 undef, -10240
  br i1 %1, label %fail, label %cleanup

cleanup:
  %or130.i = call i16 @llvm.bswap.i16(i16 %0)
  store i16 %or130.i, ptr %out.6269.i, align 2
  br label %body

fail:
  ret void
}
declare i16 @llvm.bswap.i16(i16)

; Reduced regression from D120192
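; Only the high halfword of the loaded word survives the mask, so SDAG can
; load just that halfword and byte-swap it with rev16, while GlobalISel masks
; the full word and uses rev.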
define void @test_bswap32_narrow(ptr %p0, ptr %p1) nounwind {
; CHECK-SD-LABEL: test_bswap32_narrow:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
; CHECK-SD-NEXT:    ldrh w8, [x0, #2]
; CHECK-SD-NEXT:    mov x19, x1
; CHECK-SD-NEXT:    rev16 w0, w8
; CHECK-SD-NEXT:    bl gid_tbl_len
; CHECK-SD-NEXT:    strh wzr, [x19]
; CHECK-SD-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_bswap32_narrow:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
; CHECK-GI-NEXT:    ldr w8, [x0]
; CHECK-GI-NEXT:    mov x19, x1
; CHECK-GI-NEXT:    and w8, w8, #0xffff0000
; CHECK-GI-NEXT:    rev w0, w8
; CHECK-GI-NEXT:    bl gid_tbl_len
; CHECK-GI-NEXT:    strh wzr, [x19]
; CHECK-GI-NEXT:    ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; CHECK-GI-NEXT:    ret
  %ld = load i32, ptr %p0, align 4
  %and = and i32 %ld, -65536
  %bswap = tail call i32 @llvm.bswap.i32(i32 %and)
  %and16 = zext i32 %bswap to i64
  %call17 = tail call i32 @gid_tbl_len(i64 %and16)
  store i16 0, ptr %p1, align 4
  ret void
}
declare i32 @gid_tbl_len(...)

; 64-bit REV16 is *not* a swap then a 16-bit rotation:
;  01234567 ->(bswap) 76543210 ->(rotr) 10765432
;  01234567 ->(rev16) 10325476
; Optimize patterns where rev16 can be generated for a 64-bit input.
define i64 @test_rev16_x_hwbyteswaps(i64 %a) nounwind {
; CHECK-LABEL: test_rev16_x_hwbyteswaps:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    rev16 x0, x0
; CHECK-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71777214294589695
  %2 = shl i64 %a, 8
  %3 = and i64 %2, -71777214294589696
  %4 = or i64 %1, %3
  ret i64 %4
}

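; In test_rev16_x_hwbyteswaps, (a >> 8) & 0x00ff00ff00ff00ff moves the high
; byte of each halfword into the low position and (a << 8) & 0xff00ff00ff00ff00
; does the opposite, so the or is exactly a byte swap within every 16-bit
; lane, i.e. rev16.
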
; Optimize pattern with multiple and/or to a simple pattern which can enable generation of rev16.
define i64 @test_rev16_x_hwbyteswaps_complex1(i64 %a) nounwind {
; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex1:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    lsr x8, x0, #8
; CHECK-SD-NEXT:    lsr x9, x0, #48
; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
; CHECK-SD-NEXT:    and x8, x8, #0xff0000
; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
; CHECK-SD-NEXT:    lsr x9, x0, #32
; CHECK-SD-NEXT:    orr x10, x10, x11
; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
; CHECK-SD-NEXT:    lsr x9, x0, #16
; CHECK-SD-NEXT:    orr x8, x10, x8
; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
; CHECK-SD-NEXT:    orr x0, x8, x9
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex1:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    lsr x8, x0, #8
; CHECK-GI-NEXT:    lsl x9, x0, #8
; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
; CHECK-GI-NEXT:    and x14, x8, #0xff0000
; CHECK-GI-NEXT:    orr x10, x10, x11
; CHECK-GI-NEXT:    and x11, x9, #0xff000000
; CHECK-GI-NEXT:    orr x12, x12, x13
; CHECK-GI-NEXT:    and x8, x8, #0xff
; CHECK-GI-NEXT:    orr x11, x14, x11
; CHECK-GI-NEXT:    orr x10, x10, x12
; CHECK-GI-NEXT:    and x9, x9, #0xff00
; CHECK-GI-NEXT:    orr x8, x11, x8
; CHECK-GI-NEXT:    orr x8, x10, x8
; CHECK-GI-NEXT:    orr x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71776119061217280
  %2 = shl i64 %a, 8
  %3 = and i64 %2, -72057594037927936
  %4 = or i64 %1, %3
  %5 = and i64 %0, 1095216660480
  %6 = or i64 %4, %5
  %7 = and i64 %2, 280375465082880
  %8 = or i64 %6, %7
  %9 = and i64 %0, 16711680
  %10 = or i64 %8, %9
  %11 = and i64 %2, 4278190080
  %12 = or i64 %10, %11
  %13 = and i64 %0, 255
  %14 = or i64 %12, %13
  %15 = and i64 %2, 65280
  %16 = or i64 %14, %15
  ret i64 %16
}

define i64 @test_rev16_x_hwbyteswaps_complex2(i64 %a) nounwind {
; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex2:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    lsr x8, x0, #8
; CHECK-SD-NEXT:    lsr x9, x0, #48
; CHECK-SD-NEXT:    lsr x10, x0, #32
; CHECK-SD-NEXT:    and x8, x8, #0xff00ff00ff00ff
; CHECK-SD-NEXT:    bfi x8, x9, #56, #8
; CHECK-SD-NEXT:    lsr x9, x0, #16
; CHECK-SD-NEXT:    bfi x8, x10, #40, #8
; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
; CHECK-SD-NEXT:    bfi x8, x0, #8, #8
; CHECK-SD-NEXT:    mov x0, x8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex2:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    lsr x8, x0, #8
; CHECK-GI-NEXT:    lsl x9, x0, #8
; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
; CHECK-GI-NEXT:    and x11, x8, #0xff00000000
; CHECK-GI-NEXT:    and x12, x8, #0xff0000
; CHECK-GI-NEXT:    and x8, x8, #0xff
; CHECK-GI-NEXT:    and x13, x9, #0xff00000000000000
; CHECK-GI-NEXT:    orr x10, x10, x11
; CHECK-GI-NEXT:    and x11, x9, #0xff0000000000
; CHECK-GI-NEXT:    orr x8, x12, x8
; CHECK-GI-NEXT:    and x12, x9, #0xff000000
; CHECK-GI-NEXT:    orr x11, x13, x11
; CHECK-GI-NEXT:    orr x8, x10, x8
; CHECK-GI-NEXT:    and x9, x9, #0xff00
; CHECK-GI-NEXT:    orr x10, x11, x12
; CHECK-GI-NEXT:    orr x8, x8, x10
; CHECK-GI-NEXT:    orr x0, x8, x9
; CHECK-GI-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71776119061217280
  %2 = shl i64 %a, 8
  %3 = and i64 %0, 1095216660480
  %4 = or i64 %1, %3
  %5 = and i64 %0, 16711680
  %6 = or i64 %4, %5
  %7 = and i64 %0, 255
  %8 = or i64 %6, %7
  %9 = and i64 %2, -72057594037927936
  %10 = or i64 %8, %9
  %11 = and i64 %2, 280375465082880
  %12 = or i64 %10, %11
  %13 = and i64 %2, 4278190080
  %14 = or i64 %12, %13
  %15 = and i64 %2, 65280
  %16 = or i64 %14, %15
  ret i64 %16
}

; Optimize pattern with multiple and/or to a simple pattern which can enable generation of rev16.
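; complex3 is complex1 with the operands of every or commuted, so it checks
; that the combine does not depend on operand order.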
define i64 @test_rev16_x_hwbyteswaps_complex3(i64 %a) nounwind {
; CHECK-SD-LABEL: test_rev16_x_hwbyteswaps_complex3:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    lsr x8, x0, #8
; CHECK-SD-NEXT:    lsr x9, x0, #48
; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
; CHECK-SD-NEXT:    and x11, x8, #0xff00000000
; CHECK-SD-NEXT:    and x8, x8, #0xff0000
; CHECK-SD-NEXT:    bfi x10, x9, #56, #8
; CHECK-SD-NEXT:    lsr x9, x0, #32
; CHECK-SD-NEXT:    orr x10, x11, x10
; CHECK-SD-NEXT:    bfi x10, x9, #40, #8
; CHECK-SD-NEXT:    lsr x9, x0, #16
; CHECK-SD-NEXT:    orr x8, x8, x10
; CHECK-SD-NEXT:    bfi x8, x9, #24, #8
; CHECK-SD-NEXT:    ubfiz x9, x0, #8, #8
; CHECK-SD-NEXT:    bfxil x8, x0, #8, #8
; CHECK-SD-NEXT:    orr x0, x9, x8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_rev16_x_hwbyteswaps_complex3:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    lsr x8, x0, #8
; CHECK-GI-NEXT:    lsl x9, x0, #8
; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
; CHECK-GI-NEXT:    and x11, x9, #0xff00000000000000
; CHECK-GI-NEXT:    and x12, x8, #0xff00000000
; CHECK-GI-NEXT:    and x13, x9, #0xff0000000000
; CHECK-GI-NEXT:    and x14, x8, #0xff0000
; CHECK-GI-NEXT:    orr x10, x11, x10
; CHECK-GI-NEXT:    and x11, x9, #0xff000000
; CHECK-GI-NEXT:    orr x12, x13, x12
; CHECK-GI-NEXT:    and x8, x8, #0xff
; CHECK-GI-NEXT:    orr x11, x11, x14
; CHECK-GI-NEXT:    orr x10, x12, x10
; CHECK-GI-NEXT:    and x9, x9, #0xff00
; CHECK-GI-NEXT:    orr x8, x8, x11
; CHECK-GI-NEXT:    orr x8, x8, x10
; CHECK-GI-NEXT:    orr x0, x9, x8
; CHECK-GI-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71776119061217280
  %2 = shl i64 %a, 8
  %3 = and i64 %2, -72057594037927936
  %4 = or i64 %3, %1
  %5 = and i64 %0, 1095216660480
  %6 = or i64 %5, %4
  %7 = and i64 %2, 280375465082880
  %8 = or i64 %7, %6
  %9 = and i64 %0, 16711680
  %10 = or i64 %9, %8
  %11 = and i64 %2, 4278190080
  %12 = or i64 %11, %10
  %13 = and i64 %0, 255
  %14 = or i64 %13, %12
  %15 = and i64 %2, 65280
  %16 = or i64 %15, %14
  ret i64 %16
}

define i64 @test_or_and_combine1(i64 %a) nounwind {
; CHECK-SD-LABEL: test_or_and_combine1:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    lsr x8, x0, #8
; CHECK-SD-NEXT:    lsr x9, x0, #24
; CHECK-SD-NEXT:    and x10, x8, #0xff000000000000
; CHECK-SD-NEXT:    and x8, x8, #0xff0000
; CHECK-SD-NEXT:    bfi x10, x9, #32, #8
; CHECK-SD-NEXT:    orr x0, x10, x8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: test_or_and_combine1:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    lsr x8, x0, #8
; CHECK-GI-NEXT:    lsl x9, x0, #8
; CHECK-GI-NEXT:    and x10, x8, #0xff000000000000
; CHECK-GI-NEXT:    and x9, x9, #0xff00000000
; CHECK-GI-NEXT:    and x8, x8, #0xff0000
; CHECK-GI-NEXT:    orr x9, x10, x9
; CHECK-GI-NEXT:    orr x0, x9, x8
; CHECK-GI-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71776119061217280
  %2 = shl i64 %a, 8
  %3 = and i64 %2, 1095216660480
  %4 = or i64 %1, %3
  %5 = and i64 %0, 16711680
  %6 = or i64 %4, %5
  ret i64 %6
}

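; Negative test: %2 is or'd in without a mask, so the bytes no longer form a
; clean per-halfword swap and no rev16 should be generated.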
define i64 @test_or_and_combine2(i64 %a, i64 %b) nounwind {
; CHECK-LABEL: test_or_and_combine2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    lsr x8, x0, #8
; CHECK-NEXT:    lsl x9, x0, #8
; CHECK-NEXT:    and x10, x8, #0xff000000000000
; CHECK-NEXT:    and x11, x9, #0xff00000000
; CHECK-NEXT:    and x8, x8, #0xff0000
; CHECK-NEXT:    orr x9, x10, x9
; CHECK-NEXT:    orr x8, x11, x8
; CHECK-NEXT:    orr x0, x9, x8
; CHECK-NEXT:    ret
entry:
  %0 = lshr i64 %a, 8
  %1 = and i64 %0, 71776119061217280
  %2 = shl i64 %a, 8
  %3 = or i64 %1, %2
  %4 = and i64 %2, 1095216660480
  %5 = or i64 %3, %4
  %6 = and i64 %0, 16711680
  %7 = or i64 %5, %6
  ret i64 %7
}

define i32 @pr55484(i32 %0) {
; CHECK-SD-LABEL: pr55484:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    lsr w8, w0, #8
; CHECK-SD-NEXT:    orr w8, w8, w0, lsl #8
; CHECK-SD-NEXT:    sxth w0, w8
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: pr55484:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    lsl w8, w0, #8
; CHECK-GI-NEXT:    orr w8, w8, w0, lsr #8
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
  %2 = lshr i32 %0, 8
  %3 = shl i32 %0, 8
  %4 = or i32 %2, %3
  %5 = trunc i32 %4 to i16
  %6 = sext i16 %5 to i32
  ret i32 %6
}