; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=icelake-server -passes=slp-vectorizer -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

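; An overview, inferred from the CHECK lines below: the SLP vectorizer only
; widens these scalar @llvm.ctlz calls where its cost model deems a vector
; CTLZ profitable. The i8/i16 cases vectorize on every configuration tested,
; the i32 cases vectorize from AVX2 onwards, and the i64 cases vectorize only
; on the AVX512 run (icelake-server), presumably because AVX512CD supplies
; native vplzcntd/vplzcntq instructions.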
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

@src64 = common global [4 x i64] zeroinitializer, align 32
@dst64 = common global [4 x i64] zeroinitializer, align 32
@src32 = common global [8 x i32] zeroinitializer, align 32
@dst32 = common global [8 x i32] zeroinitializer, align 32
@src16 = common global [16 x i16] zeroinitializer, align 32
@dst16 = common global [16 x i16] zeroinitializer, align 32
@src8  = common global [32 x i8] zeroinitializer, align 32
@dst8  = common global [32 x i8] zeroinitializer, align 32

declare i64 @llvm.ctlz.i64(i64, i1)
declare i32 @llvm.ctlz.i32(i32, i1)
declare i16 @llvm.ctlz.i16(i16, i1)
declare  i8 @llvm.ctlz.i8(i8, i1)

;
; CTLZ
;

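; The i64 tests below stay scalar on the SSE/AVX1/AVX2 runs and are only
; rebuilt as <2 x i64>/<4 x i64> @llvm.ctlz calls on AVX512, consistent with
; leading-zero count only being available as a vector instruction via AVX512CD.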
define void @ctlz_2i64() #0 {
; SSE-LABEL: @ctlz_2i64(
; SSE-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
; SSE-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; SSE-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
; SSE-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @ctlz_2i64(
; AVX1-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
; AVX1-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; AVX1-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
; AVX1-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ctlz_2i64(
; AVX2-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
; AVX2-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
; AVX2-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; AVX2-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; AVX2-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
; AVX2-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ctlz_2i64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 8
; AVX512-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 false)
; AVX512-NEXT:    store <2 x i64> [[TMP2]], ptr @dst64, align 8
; AVX512-NEXT:    ret void
;
  %ld0 = load i64, ptr @src64, align 8
  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
  %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 0)
  %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 0)
  store i64 %ctlz0, ptr @dst64, align 8
  store i64 %ctlz1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
  ret void
}

define void @ctlz_4i64() #0 {
; SSE-LABEL: @ctlz_4i64(
; SSE-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
; SSE-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
; SSE-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
; SSE-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
; SSE-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
; SSE-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
; SSE-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
; SSE-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @ctlz_4i64(
; AVX1-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
; AVX1-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
; AVX1-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
; AVX1-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
; AVX1-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
; AVX1-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
; AVX1-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
; AVX1-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ctlz_4i64(
; AVX2-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
; AVX2-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
; AVX2-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
; AVX2-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
; AVX2-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 false)
; AVX2-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 false)
; AVX2-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 false)
; AVX2-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 false)
; AVX2-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
; AVX2-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
; AVX2-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
; AVX2-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ctlz_4i64(
; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4
; AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 false)
; AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @dst64, align 4
; AVX512-NEXT:    ret void
;
  %ld0 = load i64, ptr @src64, align 4
  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
  %ld2 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
  %ld3 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
  %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 0)
  %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 0)
  %ctlz2 = call i64 @llvm.ctlz.i64(i64 %ld2, i1 0)
  %ctlz3 = call i64 @llvm.ctlz.i64(i64 %ld3, i1 0)
  store i64 %ctlz0, ptr @dst64, align 4
  store i64 %ctlz1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
  store i64 %ctlz2, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
  store i64 %ctlz3, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
  ret void
}

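; The i32 tests are left scalar on the SSE and AVX1 runs and become
; <4 x i32>/<8 x i32> @llvm.ctlz calls from AVX2 onwards.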
define void @ctlz_4i32() #0 {
; SSE-LABEL: @ctlz_4i32(
; SSE-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
; SSE-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
; SSE-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
; SSE-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
; SSE-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
; SSE-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
; SSE-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
; SSE-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
; SSE-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
; SSE-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
; SSE-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
; SSE-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @ctlz_4i32(
; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
; AVX1-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
; AVX1-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
; AVX1-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
; AVX1-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
; AVX1-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
; AVX1-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
; AVX1-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
; AVX1-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
; AVX1-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
; AVX1-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ctlz_4i32(
; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
; AVX2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ctlz_4i32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
; AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 false)
; AVX512-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
; AVX512-NEXT:    ret void
;
  %ld0 = load i32, ptr @src32, align 4
  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
  %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 0)
  %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 0)
  %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 0)
  %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 0)
  store i32 %ctlz0, ptr @dst32, align 4
  store i32 %ctlz1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
  store i32 %ctlz2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
  store i32 %ctlz3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
  ret void
}

define void @ctlz_8i32() #0 {
; SSE-LABEL: @ctlz_8i32(
; SSE-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 2
; SSE-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
; SSE-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
; SSE-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
; SSE-NEXT:    [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
; SSE-NEXT:    [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
; SSE-NEXT:    [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
; SSE-NEXT:    [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
; SSE-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
; SSE-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
; SSE-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
; SSE-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
; SSE-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
; SSE-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
; SSE-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
; SSE-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
; SSE-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 2
; SSE-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
; SSE-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
; SSE-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
; SSE-NEXT:    store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
; SSE-NEXT:    store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
; SSE-NEXT:    store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
; SSE-NEXT:    store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @ctlz_8i32(
; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 2
; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
; AVX1-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
; AVX1-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
; AVX1-NEXT:    [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
; AVX1-NEXT:    [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
; AVX1-NEXT:    [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
; AVX1-NEXT:    [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
; AVX1-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 false)
; AVX1-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 false)
; AVX1-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 false)
; AVX1-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 false)
; AVX1-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 false)
; AVX1-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 false)
; AVX1-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 false)
; AVX1-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 false)
; AVX1-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 2
; AVX1-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
; AVX1-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
; AVX1-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
; AVX1-NEXT:    store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
; AVX1-NEXT:    store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
; AVX1-NEXT:    store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
; AVX1-NEXT:    store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ctlz_8i32(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
; AVX2-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ctlz_8i32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 false)
; AVX512-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; AVX512-NEXT:    ret void
;
  %ld0 = load i32, ptr @src32, align 2
  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
  %ld4 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
  %ld5 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
  %ld6 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
  %ld7 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
  %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 0)
  %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 0)
  %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 0)
  %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 0)
  %ctlz4 = call i32 @llvm.ctlz.i32(i32 %ld4, i1 0)
  %ctlz5 = call i32 @llvm.ctlz.i32(i32 %ld5, i1 0)
  %ctlz6 = call i32 @llvm.ctlz.i32(i32 %ld6, i1 0)
  %ctlz7 = call i32 @llvm.ctlz.i32(i32 %ld7, i1 0)
  store i32 %ctlz0, ptr @dst32, align 2
  store i32 %ctlz1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
  store i32 %ctlz2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
  store i32 %ctlz3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
  store i32 %ctlz4, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
  store i32 %ctlz5, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
  store i32 %ctlz6, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
  store i32 %ctlz7, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
  ret void
}

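; The i16 tests vectorize on every run (hence the shared CHECK prefix); for
; sixteen elements the SSE run emits two <8 x i16> halves while the AVX runs
; use a single <16 x i16> operation.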
define void @ctlz_8i16() #0 {
; CHECK-LABEL: @ctlz_8i16(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
; CHECK-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; CHECK-NEXT:    ret void
;
  %ld0 = load i16, ptr @src16, align 2
  %ld1 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 2), align 2
  %ld3 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 3), align 2
  %ld4 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 4), align 2
  %ld5 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 5), align 2
  %ld6 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 6), align 2
  %ld7 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 7), align 2
  %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 0)
  %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 0)
  %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 0)
  %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 0)
  %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 0)
  %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 0)
  %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 0)
  %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 0)
  store i16 %ctlz0, ptr @dst16, align 2
  store i16 %ctlz1, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 1), align 2
  store i16 %ctlz2, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 2), align 2
  store i16 %ctlz3, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 3), align 2
  store i16 %ctlz4, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 4), align 2
  store i16 %ctlz5, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 5), align 2
  store i16 %ctlz6, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 6), align 2
  store i16 %ctlz7, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 7), align 2
  ret void
}

define void @ctlz_16i16() #0 {
; SSE-LABEL: @ctlz_16i16(
; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 false)
; SSE-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 8), align 2
; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 false)
; SSE-NEXT:    store <8 x i16> [[TMP4]], ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 8), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ctlz_16i16(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 false)
; AVX-NEXT:    store <16 x i16> [[TMP2]], ptr @dst16, align 2
; AVX-NEXT:    ret void
;
  %ld0  = load i16, ptr @src16, align 2
  %ld1  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  1), align 2
  %ld2  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  2), align 2
  %ld3  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  3), align 2
  %ld4  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  4), align 2
  %ld5  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  5), align 2
  %ld6  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  6), align 2
  %ld7  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  7), align 2
  %ld8  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  8), align 2
  %ld9  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  9), align 2
  %ld10 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 10), align 2
  %ld11 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 11), align 2
  %ld12 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 12), align 2
  %ld13 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 13), align 2
  %ld14 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 14), align 2
  %ld15 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 15), align 2
  %ctlz0  = call i16 @llvm.ctlz.i16(i16 %ld0, i1 0)
  %ctlz1  = call i16 @llvm.ctlz.i16(i16 %ld1, i1 0)
  %ctlz2  = call i16 @llvm.ctlz.i16(i16 %ld2, i1 0)
  %ctlz3  = call i16 @llvm.ctlz.i16(i16 %ld3, i1 0)
  %ctlz4  = call i16 @llvm.ctlz.i16(i16 %ld4, i1 0)
  %ctlz5  = call i16 @llvm.ctlz.i16(i16 %ld5, i1 0)
  %ctlz6  = call i16 @llvm.ctlz.i16(i16 %ld6, i1 0)
  %ctlz7  = call i16 @llvm.ctlz.i16(i16 %ld7, i1 0)
  %ctlz8  = call i16 @llvm.ctlz.i16(i16 %ld8, i1 0)
  %ctlz9  = call i16 @llvm.ctlz.i16(i16 %ld9, i1 0)
  %ctlz10 = call i16 @llvm.ctlz.i16(i16 %ld10, i1 0)
  %ctlz11 = call i16 @llvm.ctlz.i16(i16 %ld11, i1 0)
  %ctlz12 = call i16 @llvm.ctlz.i16(i16 %ld12, i1 0)
  %ctlz13 = call i16 @llvm.ctlz.i16(i16 %ld13, i1 0)
  %ctlz14 = call i16 @llvm.ctlz.i16(i16 %ld14, i1 0)
  %ctlz15 = call i16 @llvm.ctlz.i16(i16 %ld15, i1 0)
  store i16 %ctlz0 , ptr @dst16, align 2
  store i16 %ctlz1 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  1), align 2
  store i16 %ctlz2 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  2), align 2
  store i16 %ctlz3 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  3), align 2
  store i16 %ctlz4 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  4), align 2
  store i16 %ctlz5 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  5), align 2
  store i16 %ctlz6 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  6), align 2
  store i16 %ctlz7 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  7), align 2
  store i16 %ctlz8 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  8), align 2
  store i16 %ctlz9 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  9), align 2
  store i16 %ctlz10, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 10), align 2
  store i16 %ctlz11, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 11), align 2
  store i16 %ctlz12, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 12), align 2
  store i16 %ctlz13, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 13), align 2
  store i16 %ctlz14, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 14), align 2
  store i16 %ctlz15, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 15), align 2
  ret void
}

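; The i8 tests likewise vectorize everywhere: <16 x i8> on all runs, and for
; thirty-two elements two <16 x i8> halves on SSE versus one <32 x i8> op on AVX.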
define void @ctlz_16i8() #0 {
; CHECK-LABEL: @ctlz_16i8(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; CHECK-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %ctlz0  = call i8 @llvm.ctlz.i8(i8 %ld0, i1 0)
  %ctlz1  = call i8 @llvm.ctlz.i8(i8 %ld1, i1 0)
  %ctlz2  = call i8 @llvm.ctlz.i8(i8 %ld2, i1 0)
  %ctlz3  = call i8 @llvm.ctlz.i8(i8 %ld3, i1 0)
  %ctlz4  = call i8 @llvm.ctlz.i8(i8 %ld4, i1 0)
  %ctlz5  = call i8 @llvm.ctlz.i8(i8 %ld5, i1 0)
  %ctlz6  = call i8 @llvm.ctlz.i8(i8 %ld6, i1 0)
  %ctlz7  = call i8 @llvm.ctlz.i8(i8 %ld7, i1 0)
  %ctlz8  = call i8 @llvm.ctlz.i8(i8 %ld8, i1 0)
  %ctlz9  = call i8 @llvm.ctlz.i8(i8 %ld9, i1 0)
  %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 0)
  %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 0)
  %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 0)
  %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 0)
  %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 0)
  %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 0)
  store i8 %ctlz0 , ptr @dst8, align 1
  store i8 %ctlz1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %ctlz2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %ctlz3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %ctlz4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %ctlz5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %ctlz6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %ctlz7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %ctlz8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %ctlz9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %ctlz10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %ctlz11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %ctlz12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %ctlz13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %ctlz14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %ctlz15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  ret void
}

define void @ctlz_32i8() #0 {
; SSE-LABEL: @ctlz_32i8(
; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 false)
; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 false)
; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ctlz_32i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1
; AVX-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> [[TMP1]], i1 false)
; AVX-NEXT:    store <32 x i8> [[TMP2]], ptr @dst8, align 1
; AVX-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %ld16 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
  %ld17 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 17), align 1
  %ld18 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 18), align 1
  %ld19 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 19), align 1
  %ld20 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 20), align 1
  %ld21 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 21), align 1
  %ld22 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 22), align 1
  %ld23 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 23), align 1
  %ld24 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 24), align 1
  %ld25 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 25), align 1
  %ld26 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 26), align 1
  %ld27 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 27), align 1
  %ld28 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 28), align 1
  %ld29 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 29), align 1
  %ld30 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 30), align 1
  %ld31 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 31), align 1
  %ctlz0  = call i8 @llvm.ctlz.i8(i8 %ld0, i1 0)
  %ctlz1  = call i8 @llvm.ctlz.i8(i8 %ld1, i1 0)
  %ctlz2  = call i8 @llvm.ctlz.i8(i8 %ld2, i1 0)
  %ctlz3  = call i8 @llvm.ctlz.i8(i8 %ld3, i1 0)
  %ctlz4  = call i8 @llvm.ctlz.i8(i8 %ld4, i1 0)
  %ctlz5  = call i8 @llvm.ctlz.i8(i8 %ld5, i1 0)
  %ctlz6  = call i8 @llvm.ctlz.i8(i8 %ld6, i1 0)
  %ctlz7  = call i8 @llvm.ctlz.i8(i8 %ld7, i1 0)
  %ctlz8  = call i8 @llvm.ctlz.i8(i8 %ld8, i1 0)
  %ctlz9  = call i8 @llvm.ctlz.i8(i8 %ld9, i1 0)
  %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 0)
  %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 0)
  %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 0)
  %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 0)
  %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 0)
  %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 0)
  %ctlz16 = call i8 @llvm.ctlz.i8(i8 %ld16, i1 0)
  %ctlz17 = call i8 @llvm.ctlz.i8(i8 %ld17, i1 0)
  %ctlz18 = call i8 @llvm.ctlz.i8(i8 %ld18, i1 0)
  %ctlz19 = call i8 @llvm.ctlz.i8(i8 %ld19, i1 0)
  %ctlz20 = call i8 @llvm.ctlz.i8(i8 %ld20, i1 0)
  %ctlz21 = call i8 @llvm.ctlz.i8(i8 %ld21, i1 0)
  %ctlz22 = call i8 @llvm.ctlz.i8(i8 %ld22, i1 0)
  %ctlz23 = call i8 @llvm.ctlz.i8(i8 %ld23, i1 0)
  %ctlz24 = call i8 @llvm.ctlz.i8(i8 %ld24, i1 0)
  %ctlz25 = call i8 @llvm.ctlz.i8(i8 %ld25, i1 0)
  %ctlz26 = call i8 @llvm.ctlz.i8(i8 %ld26, i1 0)
  %ctlz27 = call i8 @llvm.ctlz.i8(i8 %ld27, i1 0)
  %ctlz28 = call i8 @llvm.ctlz.i8(i8 %ld28, i1 0)
  %ctlz29 = call i8 @llvm.ctlz.i8(i8 %ld29, i1 0)
  %ctlz30 = call i8 @llvm.ctlz.i8(i8 %ld30, i1 0)
  %ctlz31 = call i8 @llvm.ctlz.i8(i8 %ld31, i1 0)
  store i8 %ctlz0 , ptr @dst8, align 1
  store i8 %ctlz1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %ctlz2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %ctlz3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %ctlz4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %ctlz5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %ctlz6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %ctlz7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %ctlz8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %ctlz9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %ctlz10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %ctlz11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %ctlz12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %ctlz13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %ctlz14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %ctlz15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  store i8 %ctlz16, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
  store i8 %ctlz17, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 17), align 1
  store i8 %ctlz18, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 18), align 1
  store i8 %ctlz19, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 19), align 1
  store i8 %ctlz20, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 20), align 1
  store i8 %ctlz21, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 21), align 1
  store i8 %ctlz22, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 22), align 1
  store i8 %ctlz23, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 23), align 1
  store i8 %ctlz24, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 24), align 1
  store i8 %ctlz25, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 25), align 1
  store i8 %ctlz26, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 26), align 1
  store i8 %ctlz27, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 27), align 1
  store i8 %ctlz28, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 28), align 1
  store i8 %ctlz29, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 29), align 1
  store i8 %ctlz30, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 30), align 1
  store i8 %ctlz31, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 31), align 1
  ret void
}

;
; CTLZ_ZERO_UNDEF
;

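; These variants pass i1 -1 (printed as "i1 true"), i.e. the result is
; undefined (poison) for a zero input. The vectorization decisions are
; expected to match the plain CTLZ cases above.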
568define void @ctlz_undef_2i64() #0 {
569; SSE-LABEL: @ctlz_undef_2i64(
570; SSE-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
571; SSE-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
572; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
573; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
574; SSE-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
575; SSE-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
576; SSE-NEXT:    ret void
577;
578; AVX1-LABEL: @ctlz_undef_2i64(
579; AVX1-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
580; AVX1-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
581; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
582; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
583; AVX1-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
584; AVX1-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
585; AVX1-NEXT:    ret void
586;
587; AVX2-LABEL: @ctlz_undef_2i64(
588; AVX2-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 8
589; AVX2-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
590; AVX2-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
591; AVX2-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
592; AVX2-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 8
593; AVX2-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
594; AVX2-NEXT:    ret void
595;
596; AVX512-LABEL: @ctlz_undef_2i64(
597; AVX512-NEXT:    [[TMP1:%.*]] = load <2 x i64>, ptr @src64, align 8
598; AVX512-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> [[TMP1]], i1 true)
599; AVX512-NEXT:    store <2 x i64> [[TMP2]], ptr @dst64, align 8
600; AVX512-NEXT:    ret void
601;
602  %ld0 = load i64, ptr @src64, align 8
603  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i32 0, i64 1), align 8
604  %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 -1)
605  %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 -1)
606  store i64 %ctlz0, ptr @dst64, align 8
607  store i64 %ctlz1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i32 0, i64 1), align 8
608  ret void
609}
610
611define void @ctlz_undef_4i64() #0 {
612; SSE-LABEL: @ctlz_undef_4i64(
613; SSE-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
614; SSE-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
615; SSE-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
616; SSE-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
617; SSE-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
618; SSE-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
619; SSE-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
620; SSE-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
621; SSE-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
622; SSE-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
623; SSE-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
624; SSE-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
625; SSE-NEXT:    ret void
626;
627; AVX1-LABEL: @ctlz_undef_4i64(
628; AVX1-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
629; AVX1-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
630; AVX1-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
631; AVX1-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
632; AVX1-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
633; AVX1-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
634; AVX1-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
635; AVX1-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
636; AVX1-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
637; AVX1-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
638; AVX1-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
639; AVX1-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
640; AVX1-NEXT:    ret void
641;
642; AVX2-LABEL: @ctlz_undef_4i64(
643; AVX2-NEXT:    [[LD0:%.*]] = load i64, ptr @src64, align 4
644; AVX2-NEXT:    [[LD1:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
645; AVX2-NEXT:    [[LD2:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
646; AVX2-NEXT:    [[LD3:%.*]] = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
647; AVX2-NEXT:    [[CTLZ0:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD0]], i1 true)
648; AVX2-NEXT:    [[CTLZ1:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD1]], i1 true)
649; AVX2-NEXT:    [[CTLZ2:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD2]], i1 true)
650; AVX2-NEXT:    [[CTLZ3:%.*]] = call i64 @llvm.ctlz.i64(i64 [[LD3]], i1 true)
651; AVX2-NEXT:    store i64 [[CTLZ0]], ptr @dst64, align 4
652; AVX2-NEXT:    store i64 [[CTLZ1]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
653; AVX2-NEXT:    store i64 [[CTLZ2]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
654; AVX2-NEXT:    store i64 [[CTLZ3]], ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
655; AVX2-NEXT:    ret void
656;
657; AVX512-LABEL: @ctlz_undef_4i64(
658; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr @src64, align 4
659; AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> [[TMP1]], i1 true)
660; AVX512-NEXT:    store <4 x i64> [[TMP2]], ptr @dst64, align 4
661; AVX512-NEXT:    ret void
662;
663  %ld0 = load i64, ptr @src64, align 4
664  %ld1 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 1), align 4
665  %ld2 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 2), align 4
666  %ld3 = load i64, ptr getelementptr inbounds ([4 x i64], ptr @src64, i64 0, i64 3), align 4
667  %ctlz0 = call i64 @llvm.ctlz.i64(i64 %ld0, i1 -1)
668  %ctlz1 = call i64 @llvm.ctlz.i64(i64 %ld1, i1 -1)
669  %ctlz2 = call i64 @llvm.ctlz.i64(i64 %ld2, i1 -1)
670  %ctlz3 = call i64 @llvm.ctlz.i64(i64 %ld3, i1 -1)
671  store i64 %ctlz0, ptr @dst64, align 4
672  store i64 %ctlz1, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 1), align 4
673  store i64 %ctlz2, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 2), align 4
674  store i64 %ctlz3, ptr getelementptr inbounds ([4 x i64], ptr @dst64, i64 0, i64 3), align 4
675  ret void
676}
677
678define void @ctlz_undef_4i32() #0 {
679; SSE-LABEL: @ctlz_undef_4i32(
680; SSE-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
681; SSE-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
682; SSE-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
683; SSE-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
684; SSE-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
685; SSE-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
686; SSE-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
687; SSE-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
688; SSE-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
689; SSE-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
690; SSE-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
691; SSE-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
692; SSE-NEXT:    ret void
693;
694; AVX1-LABEL: @ctlz_undef_4i32(
695; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 4
696; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
697; AVX1-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
698; AVX1-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
699; AVX1-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
700; AVX1-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
701; AVX1-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
702; AVX1-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
703; AVX1-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 4
704; AVX1-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
705; AVX1-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
706; AVX1-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
707; AVX1-NEXT:    ret void
708;
709; AVX2-LABEL: @ctlz_undef_4i32(
710; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
711; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
712; AVX2-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
713; AVX2-NEXT:    ret void
714;
715; AVX512-LABEL: @ctlz_undef_4i32(
716; AVX512-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr @src32, align 4
717; AVX512-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> [[TMP1]], i1 true)
718; AVX512-NEXT:    store <4 x i32> [[TMP2]], ptr @dst32, align 4
719; AVX512-NEXT:    ret void
720;
721  %ld0 = load i32, ptr @src32, align 4
722  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 4
723  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 4
724  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 4
725  %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 -1)
726  %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 -1)
727  %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 -1)
728  %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 -1)
729  store i32 %ctlz0, ptr @dst32, align 4
730  store i32 %ctlz1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 4
731  store i32 %ctlz2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 4
732  store i32 %ctlz3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 4
733  ret void
734}
735
736define void @ctlz_undef_8i32() #0 {
737; SSE-LABEL: @ctlz_undef_8i32(
738; SSE-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 2
739; SSE-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
740; SSE-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
741; SSE-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
742; SSE-NEXT:    [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
743; SSE-NEXT:    [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
744; SSE-NEXT:    [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
745; SSE-NEXT:    [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
746; SSE-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
747; SSE-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
748; SSE-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
749; SSE-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
750; SSE-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
751; SSE-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
752; SSE-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
753; SSE-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
754; SSE-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 2
755; SSE-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
756; SSE-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
; SSE-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
; SSE-NEXT:    store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
; SSE-NEXT:    store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
; SSE-NEXT:    store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
; SSE-NEXT:    store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
; SSE-NEXT:    ret void
;
; AVX1-LABEL: @ctlz_undef_8i32(
; AVX1-NEXT:    [[LD0:%.*]] = load i32, ptr @src32, align 2
; AVX1-NEXT:    [[LD1:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
; AVX1-NEXT:    [[LD2:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
; AVX1-NEXT:    [[LD3:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
; AVX1-NEXT:    [[LD4:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
; AVX1-NEXT:    [[LD5:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
; AVX1-NEXT:    [[LD6:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
; AVX1-NEXT:    [[LD7:%.*]] = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
; AVX1-NEXT:    [[CTLZ0:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD0]], i1 true)
; AVX1-NEXT:    [[CTLZ1:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD1]], i1 true)
; AVX1-NEXT:    [[CTLZ2:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD2]], i1 true)
; AVX1-NEXT:    [[CTLZ3:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD3]], i1 true)
; AVX1-NEXT:    [[CTLZ4:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD4]], i1 true)
; AVX1-NEXT:    [[CTLZ5:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD5]], i1 true)
; AVX1-NEXT:    [[CTLZ6:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD6]], i1 true)
; AVX1-NEXT:    [[CTLZ7:%.*]] = call i32 @llvm.ctlz.i32(i32 [[LD7]], i1 true)
; AVX1-NEXT:    store i32 [[CTLZ0]], ptr @dst32, align 2
; AVX1-NEXT:    store i32 [[CTLZ1]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
; AVX1-NEXT:    store i32 [[CTLZ2]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
; AVX1-NEXT:    store i32 [[CTLZ3]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
; AVX1-NEXT:    store i32 [[CTLZ4]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
; AVX1-NEXT:    store i32 [[CTLZ5]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
; AVX1-NEXT:    store i32 [[CTLZ6]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
; AVX1-NEXT:    store i32 [[CTLZ7]], ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
; AVX1-NEXT:    ret void
;
; AVX2-LABEL: @ctlz_undef_8i32(
; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; AVX2-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true)
; AVX2-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; AVX2-NEXT:    ret void
;
; AVX512-LABEL: @ctlz_undef_8i32(
; AVX512-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr @src32, align 2
; AVX512-NEXT:    [[TMP2:%.*]] = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> [[TMP1]], i1 true)
; AVX512-NEXT:    store <8 x i32> [[TMP2]], ptr @dst32, align 2
; AVX512-NEXT:    ret void
;
  %ld0 = load i32, ptr @src32, align 2
  %ld1 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 1), align 2
  %ld2 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 2), align 2
  %ld3 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 3), align 2
  %ld4 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 4), align 2
  %ld5 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 5), align 2
  %ld6 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 6), align 2
  %ld7 = load i32, ptr getelementptr inbounds ([8 x i32], ptr @src32, i32 0, i64 7), align 2
  %ctlz0 = call i32 @llvm.ctlz.i32(i32 %ld0, i1 -1)
  %ctlz1 = call i32 @llvm.ctlz.i32(i32 %ld1, i1 -1)
  %ctlz2 = call i32 @llvm.ctlz.i32(i32 %ld2, i1 -1)
  %ctlz3 = call i32 @llvm.ctlz.i32(i32 %ld3, i1 -1)
  %ctlz4 = call i32 @llvm.ctlz.i32(i32 %ld4, i1 -1)
  %ctlz5 = call i32 @llvm.ctlz.i32(i32 %ld5, i1 -1)
  %ctlz6 = call i32 @llvm.ctlz.i32(i32 %ld6, i1 -1)
  %ctlz7 = call i32 @llvm.ctlz.i32(i32 %ld7, i1 -1)
  store i32 %ctlz0, ptr @dst32, align 2
  store i32 %ctlz1, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 1), align 2
  store i32 %ctlz2, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 2), align 2
  store i32 %ctlz3, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 3), align 2
  store i32 %ctlz4, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 4), align 2
  store i32 %ctlz5, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 5), align 2
  store i32 %ctlz6, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 6), align 2
  store i32 %ctlz7, ptr getelementptr inbounds ([8 x i32], ptr @dst32, i32 0, i64 7), align 2
  ret void
}

define void @ctlz_undef_8i16() #0 {
; CHECK-LABEL: @ctlz_undef_8i16(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
; CHECK-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; CHECK-NEXT:    ret void
;
  %ld0 = load i16, ptr @src16, align 2
  %ld1 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 1), align 2
  %ld2 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 2), align 2
  %ld3 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 3), align 2
  %ld4 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 4), align 2
  %ld5 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 5), align 2
  %ld6 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 6), align 2
  %ld7 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 7), align 2
  %ctlz0 = call i16 @llvm.ctlz.i16(i16 %ld0, i1 -1)
  %ctlz1 = call i16 @llvm.ctlz.i16(i16 %ld1, i1 -1)
  %ctlz2 = call i16 @llvm.ctlz.i16(i16 %ld2, i1 -1)
  %ctlz3 = call i16 @llvm.ctlz.i16(i16 %ld3, i1 -1)
  %ctlz4 = call i16 @llvm.ctlz.i16(i16 %ld4, i1 -1)
  %ctlz5 = call i16 @llvm.ctlz.i16(i16 %ld5, i1 -1)
  %ctlz6 = call i16 @llvm.ctlz.i16(i16 %ld6, i1 -1)
  %ctlz7 = call i16 @llvm.ctlz.i16(i16 %ld7, i1 -1)
  store i16 %ctlz0, ptr @dst16, align 2
  store i16 %ctlz1, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 1), align 2
  store i16 %ctlz2, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 2), align 2
  store i16 %ctlz3, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 3), align 2
  store i16 %ctlz4, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 4), align 2
  store i16 %ctlz5, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 5), align 2
  store i16 %ctlz6, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 6), align 2
  store i16 %ctlz7, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 7), align 2
  ret void
}

define void @ctlz_undef_16i16() #0 {
; SSE-LABEL: @ctlz_undef_16i16(
; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr @src16, align 2
; SSE-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP1]], i1 true)
; SSE-NEXT:    store <8 x i16> [[TMP2]], ptr @dst16, align 2
; SSE-NEXT:    [[TMP3:%.*]] = load <8 x i16>, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 8), align 2
; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> [[TMP3]], i1 true)
; SSE-NEXT:    store <8 x i16> [[TMP4]], ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 8), align 2
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ctlz_undef_16i16(
; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i16>, ptr @src16, align 2
; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> [[TMP1]], i1 true)
; AVX-NEXT:    store <16 x i16> [[TMP2]], ptr @dst16, align 2
; AVX-NEXT:    ret void
;
  %ld0  = load i16, ptr @src16, align 2
  %ld1  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  1), align 2
  %ld2  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  2), align 2
  %ld3  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  3), align 2
  %ld4  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  4), align 2
  %ld5  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  5), align 2
  %ld6  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  6), align 2
  %ld7  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  7), align 2
  %ld8  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  8), align 2
  %ld9  = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64  9), align 2
  %ld10 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 10), align 2
  %ld11 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 11), align 2
  %ld12 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 12), align 2
  %ld13 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 13), align 2
  %ld14 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 14), align 2
  %ld15 = load i16, ptr getelementptr inbounds ([16 x i16], ptr @src16, i16 0, i64 15), align 2
  %ctlz0  = call i16 @llvm.ctlz.i16(i16 %ld0, i1 -1)
  %ctlz1  = call i16 @llvm.ctlz.i16(i16 %ld1, i1 -1)
  %ctlz2  = call i16 @llvm.ctlz.i16(i16 %ld2, i1 -1)
  %ctlz3  = call i16 @llvm.ctlz.i16(i16 %ld3, i1 -1)
  %ctlz4  = call i16 @llvm.ctlz.i16(i16 %ld4, i1 -1)
  %ctlz5  = call i16 @llvm.ctlz.i16(i16 %ld5, i1 -1)
  %ctlz6  = call i16 @llvm.ctlz.i16(i16 %ld6, i1 -1)
  %ctlz7  = call i16 @llvm.ctlz.i16(i16 %ld7, i1 -1)
  %ctlz8  = call i16 @llvm.ctlz.i16(i16 %ld8, i1 -1)
  %ctlz9  = call i16 @llvm.ctlz.i16(i16 %ld9, i1 -1)
  %ctlz10 = call i16 @llvm.ctlz.i16(i16 %ld10, i1 -1)
  %ctlz11 = call i16 @llvm.ctlz.i16(i16 %ld11, i1 -1)
  %ctlz12 = call i16 @llvm.ctlz.i16(i16 %ld12, i1 -1)
  %ctlz13 = call i16 @llvm.ctlz.i16(i16 %ld13, i1 -1)
  %ctlz14 = call i16 @llvm.ctlz.i16(i16 %ld14, i1 -1)
  %ctlz15 = call i16 @llvm.ctlz.i16(i16 %ld15, i1 -1)
  store i16 %ctlz0 , ptr @dst16, align 2
  store i16 %ctlz1 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  1), align 2
  store i16 %ctlz2 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  2), align 2
  store i16 %ctlz3 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  3), align 2
  store i16 %ctlz4 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  4), align 2
  store i16 %ctlz5 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  5), align 2
  store i16 %ctlz6 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  6), align 2
  store i16 %ctlz7 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  7), align 2
  store i16 %ctlz8 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  8), align 2
  store i16 %ctlz9 , ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64  9), align 2
  store i16 %ctlz10, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 10), align 2
  store i16 %ctlz11, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 11), align 2
  store i16 %ctlz12, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 12), align 2
  store i16 %ctlz13, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 13), align 2
  store i16 %ctlz14, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 14), align 2
  store i16 %ctlz15, ptr getelementptr inbounds ([16 x i16], ptr @dst16, i16 0, i64 15), align 2
  ret void
}

define void @ctlz_undef_16i8() #0 {
; CHECK-LABEL: @ctlz_undef_16i8(
; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; CHECK-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %ctlz0  = call i8 @llvm.ctlz.i8(i8 %ld0, i1 -1)
  %ctlz1  = call i8 @llvm.ctlz.i8(i8 %ld1, i1 -1)
  %ctlz2  = call i8 @llvm.ctlz.i8(i8 %ld2, i1 -1)
  %ctlz3  = call i8 @llvm.ctlz.i8(i8 %ld3, i1 -1)
  %ctlz4  = call i8 @llvm.ctlz.i8(i8 %ld4, i1 -1)
  %ctlz5  = call i8 @llvm.ctlz.i8(i8 %ld5, i1 -1)
  %ctlz6  = call i8 @llvm.ctlz.i8(i8 %ld6, i1 -1)
  %ctlz7  = call i8 @llvm.ctlz.i8(i8 %ld7, i1 -1)
  %ctlz8  = call i8 @llvm.ctlz.i8(i8 %ld8, i1 -1)
  %ctlz9  = call i8 @llvm.ctlz.i8(i8 %ld9, i1 -1)
  %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 -1)
  %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 -1)
  %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 -1)
  %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 -1)
  %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 -1)
  %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 -1)
  store i8 %ctlz0 , ptr @dst8, align 1
  store i8 %ctlz1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %ctlz2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %ctlz3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %ctlz4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %ctlz5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %ctlz6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %ctlz7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %ctlz8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %ctlz9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %ctlz10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %ctlz11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %ctlz12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %ctlz13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %ctlz14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %ctlz15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  ret void
}

define void @ctlz_undef_32i8() #0 {
; SSE-LABEL: @ctlz_undef_32i8(
; SSE-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr @src8, align 1
; SSE-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP1]], i1 true)
; SSE-NEXT:    store <16 x i8> [[TMP2]], ptr @dst8, align 1
; SSE-NEXT:    [[TMP3:%.*]] = load <16 x i8>, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
; SSE-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> [[TMP3]], i1 true)
; SSE-NEXT:    store <16 x i8> [[TMP4]], ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
; SSE-NEXT:    ret void
;
; AVX-LABEL: @ctlz_undef_32i8(
; AVX-NEXT:    [[TMP1:%.*]] = load <32 x i8>, ptr @src8, align 1
; AVX-NEXT:    [[TMP2:%.*]] = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> [[TMP1]], i1 true)
; AVX-NEXT:    store <32 x i8> [[TMP2]], ptr @dst8, align 1
; AVX-NEXT:    ret void
;
  %ld0  = load i8, ptr @src8, align 1
  %ld1  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  1), align 1
  %ld2  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  2), align 1
  %ld3  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  3), align 1
  %ld4  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  4), align 1
  %ld5  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  5), align 1
  %ld6  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  6), align 1
  %ld7  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  7), align 1
  %ld8  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  8), align 1
  %ld9  = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64  9), align 1
  %ld10 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 10), align 1
  %ld11 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 11), align 1
  %ld12 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 12), align 1
  %ld13 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 13), align 1
  %ld14 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 14), align 1
  %ld15 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 15), align 1
  %ld16 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 16), align 1
  %ld17 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 17), align 1
  %ld18 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 18), align 1
  %ld19 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 19), align 1
  %ld20 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 20), align 1
  %ld21 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 21), align 1
  %ld22 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 22), align 1
  %ld23 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 23), align 1
  %ld24 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 24), align 1
  %ld25 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 25), align 1
  %ld26 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 26), align 1
  %ld27 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 27), align 1
  %ld28 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 28), align 1
  %ld29 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 29), align 1
  %ld30 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 30), align 1
  %ld31 = load i8, ptr getelementptr inbounds ([32 x i8], ptr @src8, i8 0, i64 31), align 1
  %ctlz0  = call i8 @llvm.ctlz.i8(i8 %ld0, i1 -1)
  %ctlz1  = call i8 @llvm.ctlz.i8(i8 %ld1, i1 -1)
  %ctlz2  = call i8 @llvm.ctlz.i8(i8 %ld2, i1 -1)
  %ctlz3  = call i8 @llvm.ctlz.i8(i8 %ld3, i1 -1)
  %ctlz4  = call i8 @llvm.ctlz.i8(i8 %ld4, i1 -1)
  %ctlz5  = call i8 @llvm.ctlz.i8(i8 %ld5, i1 -1)
  %ctlz6  = call i8 @llvm.ctlz.i8(i8 %ld6, i1 -1)
  %ctlz7  = call i8 @llvm.ctlz.i8(i8 %ld7, i1 -1)
  %ctlz8  = call i8 @llvm.ctlz.i8(i8 %ld8, i1 -1)
  %ctlz9  = call i8 @llvm.ctlz.i8(i8 %ld9, i1 -1)
  %ctlz10 = call i8 @llvm.ctlz.i8(i8 %ld10, i1 -1)
  %ctlz11 = call i8 @llvm.ctlz.i8(i8 %ld11, i1 -1)
  %ctlz12 = call i8 @llvm.ctlz.i8(i8 %ld12, i1 -1)
  %ctlz13 = call i8 @llvm.ctlz.i8(i8 %ld13, i1 -1)
  %ctlz14 = call i8 @llvm.ctlz.i8(i8 %ld14, i1 -1)
  %ctlz15 = call i8 @llvm.ctlz.i8(i8 %ld15, i1 -1)
  %ctlz16 = call i8 @llvm.ctlz.i8(i8 %ld16, i1 -1)
  %ctlz17 = call i8 @llvm.ctlz.i8(i8 %ld17, i1 -1)
  %ctlz18 = call i8 @llvm.ctlz.i8(i8 %ld18, i1 -1)
  %ctlz19 = call i8 @llvm.ctlz.i8(i8 %ld19, i1 -1)
  %ctlz20 = call i8 @llvm.ctlz.i8(i8 %ld20, i1 -1)
  %ctlz21 = call i8 @llvm.ctlz.i8(i8 %ld21, i1 -1)
  %ctlz22 = call i8 @llvm.ctlz.i8(i8 %ld22, i1 -1)
  %ctlz23 = call i8 @llvm.ctlz.i8(i8 %ld23, i1 -1)
  %ctlz24 = call i8 @llvm.ctlz.i8(i8 %ld24, i1 -1)
  %ctlz25 = call i8 @llvm.ctlz.i8(i8 %ld25, i1 -1)
  %ctlz26 = call i8 @llvm.ctlz.i8(i8 %ld26, i1 -1)
  %ctlz27 = call i8 @llvm.ctlz.i8(i8 %ld27, i1 -1)
  %ctlz28 = call i8 @llvm.ctlz.i8(i8 %ld28, i1 -1)
  %ctlz29 = call i8 @llvm.ctlz.i8(i8 %ld29, i1 -1)
  %ctlz30 = call i8 @llvm.ctlz.i8(i8 %ld30, i1 -1)
  %ctlz31 = call i8 @llvm.ctlz.i8(i8 %ld31, i1 -1)
  store i8 %ctlz0 , ptr @dst8, align 1
  store i8 %ctlz1 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  1), align 1
  store i8 %ctlz2 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  2), align 1
  store i8 %ctlz3 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  3), align 1
  store i8 %ctlz4 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  4), align 1
  store i8 %ctlz5 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  5), align 1
  store i8 %ctlz6 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  6), align 1
  store i8 %ctlz7 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  7), align 1
  store i8 %ctlz8 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  8), align 1
  store i8 %ctlz9 , ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64  9), align 1
  store i8 %ctlz10, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 10), align 1
  store i8 %ctlz11, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 11), align 1
  store i8 %ctlz12, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 12), align 1
  store i8 %ctlz13, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 13), align 1
  store i8 %ctlz14, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 14), align 1
  store i8 %ctlz15, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 15), align 1
  store i8 %ctlz16, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 16), align 1
  store i8 %ctlz17, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 17), align 1
  store i8 %ctlz18, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 18), align 1
  store i8 %ctlz19, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 19), align 1
  store i8 %ctlz20, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 20), align 1
  store i8 %ctlz21, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 21), align 1
  store i8 %ctlz22, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 22), align 1
  store i8 %ctlz23, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 23), align 1
  store i8 %ctlz24, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 24), align 1
  store i8 %ctlz25, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 25), align 1
  store i8 %ctlz26, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 26), align 1
  store i8 %ctlz27, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 27), align 1
  store i8 %ctlz28, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 28), align 1
  store i8 %ctlz29, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 29), align 1
  store i8 %ctlz30, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 30), align 1
  store i8 %ctlz31, ptr getelementptr inbounds ([32 x i8], ptr @dst8, i8 0, i64 31), align 1
  ret void
}

attributes #0 = { nounwind }
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; SSE2: {{.*}}
; SSE4: {{.*}}