; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 < %s | FileCheck %s

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"

; Stack reload folding tests.
;
; By including a nop call with side effects that clobbers all general-purpose
; registers, we force the relevant values to be spilled and check that the
; reload is correctly folded into the instruction as a memory operand.
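
; BZHI copies the first source and zeroes every bit at or above the index given
; in the second source. The value being truncated is the instruction's r/m
; operand, so the tests below expect bzhil/bzhiq to read it directly from its
; stack slot as a folded reload.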
define i32 @stack_fold_bzhi_u32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_bzhi_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    bzhil %eax, {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)

define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1) {
; CHECK-LABEL: stack_fold_bzhi_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    bzhiq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
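
; PDEP deposits contiguous low-order bits of the first source into the bit
; positions set in the mask. The mask is the instruction's r/m operand, so the
; tests below expect pdepl/pdepq to read it directly from its stack slot as a
; folded reload.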
define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_pdep_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    pdepl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.pdep.32(i32, i32)

define i64 @stack_fold_pdep_u64(i64 %a0, i64 %a1) {
; CHECK-LABEL: stack_fold_pdep_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    pdepq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
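
; PEXT extracts the bits of the first source selected by the mask and packs
; them into contiguous low-order bits. As with PDEP, the mask is the r/m
; operand, so pextl/pextq fold its reload from the stack slot.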
define i32 @stack_fold_pext_u32(i32 %a0, i32 %a1) {
; CHECK-LABEL: stack_fold_pext_u32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
; CHECK-NEXT:    pextl {{[-0-9]+}}(%r{{[sb]}}p), %eax, %eax # 4-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
  ret i32 %2
}
declare i32 @llvm.x86.bmi.pext.32(i32, i32)

define i64 @stack_fold_pext_u64(i64 %a0, i64 %a1) {
; CHECK-LABEL: stack_fold_pext_u64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    pushq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    pushq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    pushq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    pushq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 56
; CHECK-NEXT:    .cfi_offset %rbx, -56
; CHECK-NEXT:    .cfi_offset %r12, -48
; CHECK-NEXT:    .cfi_offset %r13, -40
; CHECK-NEXT:    .cfi_offset %r14, -32
; CHECK-NEXT:    .cfi_offset %r15, -24
; CHECK-NEXT:    .cfi_offset %rbp, -16
; CHECK-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT:    #APP
; CHECK-NEXT:    nop
; CHECK-NEXT:    #NO_APP
; CHECK-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; CHECK-NEXT:    pextq {{[-0-9]+}}(%r{{[sb]}}p), %rax, %rax # 8-byte Folded Reload
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    popq %r12
; CHECK-NEXT:    .cfi_def_cfa_offset 40
; CHECK-NEXT:    popq %r13
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    popq %r14
; CHECK-NEXT:    .cfi_def_cfa_offset 24
; CHECK-NEXT:    popq %r15
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    .cfi_def_cfa_offset 8
; CHECK-NEXT:    retq
  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
  %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
  ret i64 %2
}
declare i64 @llvm.x86.bmi.pext.64(i64, i64)