; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver2 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=bdver4 -passes=slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=XOP

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@src64 = common global [4 x i64] zeroinitializer, align 32
@dst64 = common global [4 x i64] zeroinitializer, align 32
@src32 = common global [8 x i32] zeroinitializer, align 32
@dst32 = common global [8 x i32] zeroinitializer, align 32
@src16 = common global [16 x i16] zeroinitializer, align 32
@dst16 = common global [16 x i16] zeroinitializer, align 32
@src8  = common global [32 x i8] zeroinitializer, align 32
@dst8  = common global [32 x i8] zeroinitializer, align 32

declare i64 @llvm.bitreverse.i64(i64)
declare i32 @llvm.bitreverse.i32(i32)
declare i16 @llvm.bitreverse.i16(i16)
declare  i8 @llvm.bitreverse.i8(i8)

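; Two scalar i64 bitreverse calls on consecutive elements are vectorized into
; a single @llvm.bitreverse.v2i64 on all tested targets.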
define void @bitreverse_2i64() #0 {
; CHECK-LABEL: @bitreverse_2i64(
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 8
; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr @dst64, align 8
; CHECK-NEXT:    ret void
;
  %ld0 = load i64, ptr @src64, align 8
  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
  %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
  %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
  store i64 %bitreverse0, ptr @dst64, align 8
  store i64 %bitreverse1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

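; Four i64 elements: SSE splits the work into two v2i64 halves, while AVX and
; XOP use a single v4i64 operation.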
define void @bitreverse_4i64() #0 {
; SSE-LABEL: @bitreverse_4i64(
; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 4
; SSE-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
; SSE-NEXT:    store <2 x i64> [[TMP2]], ptr @dst64, align 4
; SSE-NEXT:    [[TMP3:%.*]] = load <2 x i64>, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
; SSE-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]])
; SSE-NEXT:    store <2 x i64> [[TMP4]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_4i64(
; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4
; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; AVX-NEXT:    store <4 x i64> [[TMP2]], ptr @dst64, align 4
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_4i64(
; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4
; XOP-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
; XOP-NEXT:    store <4 x i64> [[TMP2]], ptr @dst64, align 4
; XOP-NEXT:    ret void
;
  %ld0 = load i64, ptr @src64, align 4
  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
  %ld2 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
  %ld3 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
  %bitreverse0 = call i64 @llvm.bitreverse.i64(i64 %ld0)
  %bitreverse1 = call i64 @llvm.bitreverse.i64(i64 %ld1)
  %bitreverse2 = call i64 @llvm.bitreverse.i64(i64 %ld2)
  %bitreverse3 = call i64 @llvm.bitreverse.i64(i64 %ld3)
  store i64 %bitreverse0, ptr @dst64, align 4
  store i64 %bitreverse1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
  store i64 %bitreverse2, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
  store i64 %bitreverse3, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
  ret void
}

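; Four consecutive i32 elements fit in a 128-bit vector, so all targets emit
; a single @llvm.bitreverse.v4i32.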
define void @bitreverse_4i32() #0 {
; CHECK-LABEL: @bitreverse_4i32(
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
; CHECK-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
; CHECK-NEXT:    ret void
;
  %ld0 = load i32, ptr @src32, align 4
  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
  %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
  %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
  %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
  %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
  store i32 %bitreverse0, ptr @dst32, align 4
  store i32 %bitreverse1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
  store i32 %bitreverse2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
  store i32 %bitreverse3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

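; Eight i32 elements: SSE splits the work into two v4i32 halves, while AVX and
; XOP use a single v8i32 operation.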
define void @bitreverse_8i32() #0 {
; SSE-LABEL: @bitreverse_8i32(
; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 2
; SSE-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
; SSE-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 2
; SSE-NEXT:    [[TMP3:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP3]])
; SSE-NEXT:    store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_8i32(
; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; AVX-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_8i32(
; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; XOP-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> [[TMP1]])
; XOP-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; XOP-NEXT:    ret void
;
  %ld0 = load i32, ptr @src32, align 2
  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
  %ld4 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
  %ld5 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
  %ld6 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
  %ld7 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
  %bitreverse0 = call i32 @llvm.bitreverse.i32(i32 %ld0)
  %bitreverse1 = call i32 @llvm.bitreverse.i32(i32 %ld1)
  %bitreverse2 = call i32 @llvm.bitreverse.i32(i32 %ld2)
  %bitreverse3 = call i32 @llvm.bitreverse.i32(i32 %ld3)
  %bitreverse4 = call i32 @llvm.bitreverse.i32(i32 %ld4)
  %bitreverse5 = call i32 @llvm.bitreverse.i32(i32 %ld5)
  %bitreverse6 = call i32 @llvm.bitreverse.i32(i32 %ld6)
  %bitreverse7 = call i32 @llvm.bitreverse.i32(i32 %ld7)
  store i32 %bitreverse0, ptr @dst32, align 2
  store i32 %bitreverse1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
  store i32 %bitreverse2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
  store i32 %bitreverse3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
  store i32 %bitreverse4, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
  store i32 %bitreverse5, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
  store i32 %bitreverse6, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
  store i32 %bitreverse7, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
  ret void
}

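; Eight consecutive i16 elements fit in a 128-bit vector, so all targets emit
; a single @llvm.bitreverse.v8i16.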
define void @bitreverse_8i16() #0 {
; CHECK-LABEL: @bitreverse_8i16(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
; CHECK-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; CHECK-NEXT:    ret void
;
  %ld0 = load i16, ptr @src16, align 2
  %ld1 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 2), align 2
  %ld3 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 3), align 2
  %ld4 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 4), align 2
  %ld5 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 5), align 2
  %ld6 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 6), align 2
  %ld7 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 7), align 2
  %bitreverse0 = call i16 @llvm.bitreverse.i16(i16 %ld0)
  %bitreverse1 = call i16 @llvm.bitreverse.i16(i16 %ld1)
  %bitreverse2 = call i16 @llvm.bitreverse.i16(i16 %ld2)
  %bitreverse3 = call i16 @llvm.bitreverse.i16(i16 %ld3)
  %bitreverse4 = call i16 @llvm.bitreverse.i16(i16 %ld4)
  %bitreverse5 = call i16 @llvm.bitreverse.i16(i16 %ld5)
  %bitreverse6 = call i16 @llvm.bitreverse.i16(i16 %ld6)
  %bitreverse7 = call i16 @llvm.bitreverse.i16(i16 %ld7)
  store i16 %bitreverse0, ptr @dst16, align 2
  store i16 %bitreverse1, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 1), align 2
  store i16 %bitreverse2, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 2), align 2
  store i16 %bitreverse3, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 3), align 2
  store i16 %bitreverse4, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 4), align 2
  store i16 %bitreverse5, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 5), align 2
  store i16 %bitreverse6, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 6), align 2
  store i16 %bitreverse7, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 7), align 2
  ret void
}

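; Sixteen i16 elements: SSE splits the work into two v8i16 halves, while AVX
; and XOP use a single v16i16 operation.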
define void @bitreverse_16i16() #0 {
; SSE-LABEL: @bitreverse_16i16(
; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
; SSE-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 8), align 2
; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP3]])
; SSE-NEXT:    store <8 x i16> [[TMP4]], ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 8), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_16i16(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; AVX-NEXT:    store <16 x i16> [[TMP2]], ptr @dst16, align 2
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_16i16(
; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2
; XOP-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> [[TMP1]])
; XOP-NEXT:    store <16 x i16> [[TMP2]], ptr @dst16, align 2
; XOP-NEXT:    ret void
;
  %ld0  = load i16, ptr @src16, align 2
  %ld1  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  1), align 2
  %ld2  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  2), align 2
  %ld3  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  3), align 2
  %ld4  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  4), align 2
  %ld5  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  5), align 2
  %ld6  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  6), align 2
  %ld7  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  7), align 2
  %ld8  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  8), align 2
  %ld9  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  9), align 2
  %ld10 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 10), align 2
  %ld11 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 11), align 2
  %ld12 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 12), align 2
  %ld13 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 13), align 2
  %ld14 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 14), align 2
  %ld15 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 15), align 2
  %bitreverse0  = call i16 @llvm.bitreverse.i16(i16 %ld0)
  %bitreverse1  = call i16 @llvm.bitreverse.i16(i16 %ld1)
  %bitreverse2  = call i16 @llvm.bitreverse.i16(i16 %ld2)
  %bitreverse3  = call i16 @llvm.bitreverse.i16(i16 %ld3)
  %bitreverse4  = call i16 @llvm.bitreverse.i16(i16 %ld4)
  %bitreverse5  = call i16 @llvm.bitreverse.i16(i16 %ld5)
  %bitreverse6  = call i16 @llvm.bitreverse.i16(i16 %ld6)
  %bitreverse7  = call i16 @llvm.bitreverse.i16(i16 %ld7)
  %bitreverse8  = call i16 @llvm.bitreverse.i16(i16 %ld8)
  %bitreverse9  = call i16 @llvm.bitreverse.i16(i16 %ld9)
  %bitreverse10 = call i16 @llvm.bitreverse.i16(i16 %ld10)
  %bitreverse11 = call i16 @llvm.bitreverse.i16(i16 %ld11)
  %bitreverse12 = call i16 @llvm.bitreverse.i16(i16 %ld12)
  %bitreverse13 = call i16 @llvm.bitreverse.i16(i16 %ld13)
  %bitreverse14 = call i16 @llvm.bitreverse.i16(i16 %ld14)
  %bitreverse15 = call i16 @llvm.bitreverse.i16(i16 %ld15)
  store i16 %bitreverse0 , ptr @dst16, align 2
  store i16 %bitreverse1 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  1), align 2
  store i16 %bitreverse2 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  2), align 2
  store i16 %bitreverse3 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  3), align 2
  store i16 %bitreverse4 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  4), align 2
  store i16 %bitreverse5 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  5), align 2
  store i16 %bitreverse6 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  6), align 2
  store i16 %bitreverse7 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  7), align 2
  store i16 %bitreverse8 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  8), align 2
  store i16 %bitreverse9 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  9), align 2
  store i16 %bitreverse10, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 10), align 2
  store i16 %bitreverse11, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 11), align 2
  store i16 %bitreverse12, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 12), align 2
  store i16 %bitreverse13, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 13), align 2
  store i16 %bitreverse14, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 14), align 2
  store i16 %bitreverse15, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 15), align 2
  ret void
}

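; Sixteen consecutive i8 elements fit in a 128-bit vector, so all targets emit
; a single @llvm.bitreverse.v16i8.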
define void @bitreverse_16i8() #0 {
; CHECK-LABEL: @bitreverse_16i8(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; CHECK-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %bitreverse0  = call i8 @llvm.bitreverse.i8(i8 %ld0)
  %bitreverse1  = call i8 @llvm.bitreverse.i8(i8 %ld1)
  %bitreverse2  = call i8 @llvm.bitreverse.i8(i8 %ld2)
  %bitreverse3  = call i8 @llvm.bitreverse.i8(i8 %ld3)
  %bitreverse4  = call i8 @llvm.bitreverse.i8(i8 %ld4)
  %bitreverse5  = call i8 @llvm.bitreverse.i8(i8 %ld5)
  %bitreverse6  = call i8 @llvm.bitreverse.i8(i8 %ld6)
  %bitreverse7  = call i8 @llvm.bitreverse.i8(i8 %ld7)
  %bitreverse8  = call i8 @llvm.bitreverse.i8(i8 %ld8)
  %bitreverse9  = call i8 @llvm.bitreverse.i8(i8 %ld9)
  %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
  %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
  %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
  %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
  %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
  %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
  store i8 %bitreverse0 , ptr @dst8, align 1
  store i8 %bitreverse1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %bitreverse2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %bitreverse3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %bitreverse4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %bitreverse5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %bitreverse6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %bitreverse7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %bitreverse8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %bitreverse9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %bitreverse10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %bitreverse11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %bitreverse12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %bitreverse13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %bitreverse14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %bitreverse15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  ret void
}

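; Thirty-two i8 elements: SSE splits the work into two v16i8 halves, while AVX
; and XOP use a single v32i8 operation.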
define void @bitreverse_32i8() #0 {
; SSE-LABEL: @bitreverse_32i8(
; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP3]])
; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @bitreverse_32i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1
; AVX-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> [[TMP1]])
; AVX-NEXT:    store <32 x i8> [[TMP2]], ptr @dst8, align 1
; AVX-NEXT:    ret void
;
; XOP-LABEL: @bitreverse_32i8(
; XOP-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1
; XOP-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> [[TMP1]])
; XOP-NEXT:    store <32 x i8> [[TMP2]], ptr @dst8, align 1
; XOP-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %ld16 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
  %ld17 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 17), align 1
  %ld18 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 18), align 1
  %ld19 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 19), align 1
  %ld20 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 20), align 1
  %ld21 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 21), align 1
  %ld22 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 22), align 1
  %ld23 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 23), align 1
  %ld24 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 24), align 1
  %ld25 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 25), align 1
  %ld26 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 26), align 1
  %ld27 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 27), align 1
  %ld28 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 28), align 1
  %ld29 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 29), align 1
  %ld30 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 30), align 1
  %ld31 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 31), align 1
  %bitreverse0  = call i8 @llvm.bitreverse.i8(i8 %ld0)
  %bitreverse1  = call i8 @llvm.bitreverse.i8(i8 %ld1)
  %bitreverse2  = call i8 @llvm.bitreverse.i8(i8 %ld2)
  %bitreverse3  = call i8 @llvm.bitreverse.i8(i8 %ld3)
  %bitreverse4  = call i8 @llvm.bitreverse.i8(i8 %ld4)
  %bitreverse5  = call i8 @llvm.bitreverse.i8(i8 %ld5)
  %bitreverse6  = call i8 @llvm.bitreverse.i8(i8 %ld6)
  %bitreverse7  = call i8 @llvm.bitreverse.i8(i8 %ld7)
  %bitreverse8  = call i8 @llvm.bitreverse.i8(i8 %ld8)
  %bitreverse9  = call i8 @llvm.bitreverse.i8(i8 %ld9)
  %bitreverse10 = call i8 @llvm.bitreverse.i8(i8 %ld10)
  %bitreverse11 = call i8 @llvm.bitreverse.i8(i8 %ld11)
  %bitreverse12 = call i8 @llvm.bitreverse.i8(i8 %ld12)
  %bitreverse13 = call i8 @llvm.bitreverse.i8(i8 %ld13)
  %bitreverse14 = call i8 @llvm.bitreverse.i8(i8 %ld14)
  %bitreverse15 = call i8 @llvm.bitreverse.i8(i8 %ld15)
  %bitreverse16 = call i8 @llvm.bitreverse.i8(i8 %ld16)
  %bitreverse17 = call i8 @llvm.bitreverse.i8(i8 %ld17)
  %bitreverse18 = call i8 @llvm.bitreverse.i8(i8 %ld18)
  %bitreverse19 = call i8 @llvm.bitreverse.i8(i8 %ld19)
  %bitreverse20 = call i8 @llvm.bitreverse.i8(i8 %ld20)
  %bitreverse21 = call i8 @llvm.bitreverse.i8(i8 %ld21)
  %bitreverse22 = call i8 @llvm.bitreverse.i8(i8 %ld22)
  %bitreverse23 = call i8 @llvm.bitreverse.i8(i8 %ld23)
  %bitreverse24 = call i8 @llvm.bitreverse.i8(i8 %ld24)
  %bitreverse25 = call i8 @llvm.bitreverse.i8(i8 %ld25)
  %bitreverse26 = call i8 @llvm.bitreverse.i8(i8 %ld26)
  %bitreverse27 = call i8 @llvm.bitreverse.i8(i8 %ld27)
  %bitreverse28 = call i8 @llvm.bitreverse.i8(i8 %ld28)
  %bitreverse29 = call i8 @llvm.bitreverse.i8(i8 %ld29)
  %bitreverse30 = call i8 @llvm.bitreverse.i8(i8 %ld30)
  %bitreverse31 = call i8 @llvm.bitreverse.i8(i8 %ld31)
  store i8 %bitreverse0 , ptr @dst8, align 1
  store i8 %bitreverse1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %bitreverse2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %bitreverse3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %bitreverse4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %bitreverse5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %bitreverse6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %bitreverse7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %bitreverse8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %bitreverse9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %bitreverse10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %bitreverse11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %bitreverse12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %bitreverse13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %bitreverse14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %bitreverse15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  store i8 %bitreverse16, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
  store i8 %bitreverse17, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 17), align 1
  store i8 %bitreverse18, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 18), align 1
  store i8 %bitreverse19, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 19), align 1
  store i8 %bitreverse20, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 20), align 1
  store i8 %bitreverse21, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 21), align 1
  store i8 %bitreverse22, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 22), align 1
  store i8 %bitreverse23, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 23), align 1
  store i8 %bitreverse24, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 24), align 1
  store i8 %bitreverse25, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 25), align 1
  store i8 %bitreverse26, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 26), align 1
  store i8 %bitreverse27, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 27), align 1
  store i8 %bitreverse28, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 28), align 1
  store i8 %bitreverse29, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 29), align 1
  store i8 %bitreverse30, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 30), align 1
  store i8 %bitreverse31, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 31), align 1
  ret void
}

attributes #0 = { nounwind }