; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 < %s | FileCheck %s

; This file checks the reassociation of ADD instructions.
; The two ADD instructions add v0, v1, and t2 together. t2 has a long
; dependence chain, while v0 and v1 have short dependence chains; to get the
; shortest latency, v0 and v1 should be added first, and their result added
; to t2 afterwards.
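;
; As an illustrative sketch (pseudo-IR, not part of the checked input), the
; functions below compute
;   t3 = add t2, v1
;   t4 = add t3, v0
; and the expected codegen is equivalent to
;   t4 = add t2, (add v0, v1)
; so the short-latency v0+v1 add overlaps with the long chain that feeds t2.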

define void @add8(i8 %x0, i8 %x1, i8 %x2, ptr %p) {
; CHECK-LABEL: add8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    orb $16, %dil
; CHECK-NEXT:    orb $32, %sil
; CHECK-NEXT:    addb %dil, %sil
; CHECK-NEXT:    addb $-8, %dl
; CHECK-NEXT:    orb $7, %dl
; CHECK-NEXT:    movzbl %dl, %eax
; CHECK-NEXT:    imull $100, %eax, %eax
; CHECK-NEXT:    addb %sil, %al
; CHECK-NEXT:    movb %al, (%rcx)
; CHECK-NEXT:    retq
  %v0 = or i8 %x0, 16
  %v1 = or i8 %x1, 32
  %t0 = sub i8 %x2, 8
  %t1 = or i8 %t0, 7
  %t2 = mul i8 %t1, 100
  %t3 = add i8 %t2, %v1
  %t4 = add i8 %t3, %v0
  store i8 %t4, ptr %p, align 4
  ret void
}

define void @add16(i16 %x0, i16 %x1, i16 %x2, ptr %p) {
; CHECK-LABEL: add16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    orl $16, %edi
; CHECK-NEXT:    orl $32, %esi
; CHECK-NEXT:    addl %edi, %esi
; CHECK-NEXT:    addl $-8, %edx
; CHECK-NEXT:    orl $7, %edx
; CHECK-NEXT:    imull $100, %edx, %eax
; CHECK-NEXT:    addl %esi, %eax
; CHECK-NEXT:    movw %ax, (%rcx)
; CHECK-NEXT:    retq
  %v0 = or i16 %x0, 16
  %v1 = or i16 %x1, 32
  %t0 = sub i16 %x2, 8
  %t1 = or i16 %t0, 7
  %t2 = mul i16 %t1, 100
  %t3 = add i16 %t2, %v1
  %t4 = add i16 %t3, %v0
  store i16 %t4, ptr %p, align 4
  ret void
}

define void @add32(i32 %x0, i32 %x1, i32 %x2, ptr %p) {
; CHECK-LABEL: add32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    orl $16, %edi
; CHECK-NEXT:    orl $32, %esi
; CHECK-NEXT:    addl %edi, %esi
; CHECK-NEXT:    addl $-8, %edx
; CHECK-NEXT:    orl $7, %edx
; CHECK-NEXT:    imull $100, %edx, %eax
; CHECK-NEXT:    addl %esi, %eax
; CHECK-NEXT:    movl %eax, (%rcx)
; CHECK-NEXT:    retq
  %v0 = or i32 %x0, 16
  %v1 = or i32 %x1, 32
  %t0 = sub i32 %x2, 8
  %t1 = or i32 %t0, 7
  %t2 = mul i32 %t1, 100
  %t3 = add i32 %t2, %v1
  %t4 = add i32 %t3, %v0
  store i32 %t4, ptr %p, align 4
  ret void
}

define void @add64(i64 %x0, i64 %x1, i64 %x2, ptr %p) {
; CHECK-LABEL: add64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    orq $16, %rdi
; CHECK-NEXT:    orq $32, %rsi
; CHECK-NEXT:    addq %rdi, %rsi
; CHECK-NEXT:    addq $-8, %rdx
; CHECK-NEXT:    orq $7, %rdx
; CHECK-NEXT:    imulq $100, %rdx, %rax
; CHECK-NEXT:    addq %rsi, %rax
; CHECK-NEXT:    movq %rax, (%rcx)
; CHECK-NEXT:    retq
  %v0 = or i64 %x0, 16
  %v1 = or i64 %x1, 32
  %t0 = sub i64 %x2, 8
  %t1 = or i64 %t0, 7
  %t2 = mul i64 %t1, 100
  %t3 = add i64 %t2, %v1
  %t4 = add i64 %t3, %v0
  store i64 %t4, ptr %p, align 4
  ret void
}

; Negative test. The original sequence already has the shorter latency, so it
; should not be transformed.
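; Here the IR already adds v0 and v1 first and only then adds t2:
;   %t3 = add i64 %v0, %v1
;   %t4 = add i64 %t3, %t2
; so no reassociation is needed, and the expected asm matches @add64.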
define void @add64_negative(i64 %x0, i64 %x1, i64 %x2, ptr %p) {
; CHECK-LABEL: add64_negative:
; CHECK:       # %bb.0:
; CHECK-NEXT:    orq $16, %rdi
; CHECK-NEXT:    orq $32, %rsi
; CHECK-NEXT:    addq %rdi, %rsi
; CHECK-NEXT:    addq $-8, %rdx
; CHECK-NEXT:    orq $7, %rdx
; CHECK-NEXT:    imulq $100, %rdx, %rax
; CHECK-NEXT:    addq %rsi, %rax
; CHECK-NEXT:    movq %rax, (%rcx)
; CHECK-NEXT:    retq
  %v0 = or i64 %x0, 16
  %v1 = or i64 %x1, 32
  %t0 = sub i64 %x2, 8
  %t1 = or i64 %t0, 7
  %t2 = mul i64 %t1, 100
  %t3 = add i64 %v0, %v1
  %t4 = add i64 %t3, %t2
  store i64 %t4, ptr %p, align 4
  ret void
}