xref: /llvm-project/llvm/test/CodeGen/X86/sttni.ll (revision e975ff0a223e79842b693e0ec4d3cac87963869a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
4
; SSE4.2 string-compare intrinsics used by this file. The "e" forms take an
; explicit i32 length after each vector operand; the "i" forms take only the
; two vectors. Every form ends with an i8 control operand, which the tests pass
; as a constant (24 or 25) and which appears as the instruction immediate in the
; expected assembly.
; NOTE(review): the two *strm128 declarations are not called in the portion of
; the file reviewed here - confirm they are still needed.
5declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
6declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
7declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32, <16 x i8>, i32, i8)
8declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8>, i8)
9declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8>, i8)
10declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8>, i8)
11
; Flag-result form of the explicit-length compare with both vectors passed as
; vector arguments. The IR tests the pcmpestric128 result against 0; the
; expected code puts the two lengths in EAX and EDX, issues pcmpestri with
; immediate 24 and reads the answer directly from the flags with setae, so no
; i32 result is materialised.
12define i1 @pcmpestri_reg_eq_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
13; X86-LABEL: pcmpestri_reg_eq_i8:
14; X86:       # %bb.0: # %entry
15; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
16; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
17; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
18; X86-NEXT:    setae %al
19; X86-NEXT:    retl
20;
21; X64-LABEL: pcmpestri_reg_eq_i8:
22; X64:       # %bb.0: # %entry
23; X64-NEXT:    movl %esi, %edx
24; X64-NEXT:    movl %edi, %eax
25; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
26; X64-NEXT:    setae %al
27; X64-NEXT:    retq
28entry:
29  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ; Return true exactly when the intrinsic produced 0.
30  %result = icmp eq i32 %c, 0
31  ret i1 %result
32}
33
; Index-result form of the explicit-length compare with both vectors passed as
; vector arguments. The i32 returned by pcmpestri128 is the function result;
; the expected code loads the lengths into EAX/EDX and, after pcmpestri, copies
; ECX into EAX for the return.
34define i32 @pcmpestri_reg_idx_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
35; X86-LABEL: pcmpestri_reg_idx_i8:
36; X86:       # %bb.0: # %entry
37; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
38; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
39; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
40; X86-NEXT:    movl %ecx, %eax
41; X86-NEXT:    retl
42;
43; X64-LABEL: pcmpestri_reg_idx_i8:
44; X64:       # %bb.0: # %entry
45; X64-NEXT:    movl %esi, %edx
46; X64-NEXT:    movl %edi, %eax
47; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
48; X64-NEXT:    movl %ecx, %eax
49; X64-NEXT:    retq
50entry:
51  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
52  ret i32 %idx
53}
54
; Index-result form followed by a variable-index extractelement on both inputs.
; When the returned index equals 16 the function returns 0; otherwise it returns
; the zero-extended i8 difference of the two bytes selected by the index.
; The expected code spills both vectors to 16-byte-aligned stack slots and reads
; the selected bytes from memory, masking the index with 15 first. On i686 the
; stack is realigned explicitly (andl $-16, %esp); on x86_64 the slots are
; addressed at negative offsets from RSP without adjusting it.
55define i32 @pcmpestri_reg_diff_i8(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len) nounwind {
56; X86-LABEL: pcmpestri_reg_diff_i8:
57; X86:       # %bb.0: # %entry
58; X86-NEXT:    pushl %ebp
59; X86-NEXT:    movl %esp, %ebp
60; X86-NEXT:    andl $-16, %esp
61; X86-NEXT:    subl $48, %esp
62; X86-NEXT:    movl 8(%ebp), %eax
63; X86-NEXT:    movl 12(%ebp), %edx
64; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
65; X86-NEXT:    cmpl $16, %ecx
66; X86-NEXT:    jne .LBB2_2
67; X86-NEXT:  # %bb.1:
68; X86-NEXT:    xorl %eax, %eax
69; X86-NEXT:    jmp .LBB2_3
70; X86-NEXT:  .LBB2_2: # %compare
71; X86-NEXT:    movdqa %xmm0, (%esp)
72; X86-NEXT:    andl $15, %ecx
73; X86-NEXT:    movzbl (%esp,%ecx), %eax
74; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
75; X86-NEXT:    subb 16(%esp,%ecx), %al
76; X86-NEXT:  .LBB2_3: # %exit
77; X86-NEXT:    movzbl %al, %eax
78; X86-NEXT:    movl %ebp, %esp
79; X86-NEXT:    popl %ebp
80; X86-NEXT:    retl
81;
82; X64-LABEL: pcmpestri_reg_diff_i8:
83; X64:       # %bb.0: # %entry
84; X64-NEXT:    movl %esi, %edx
85; X64-NEXT:    movl %edi, %eax
86; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
87; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
88; X64-NEXT:    cmpl $16, %ecx
89; X64-NEXT:    jne .LBB2_2
90; X64-NEXT:  # %bb.1:
91; X64-NEXT:    xorl %eax, %eax
92; X64-NEXT:    movzbl %al, %eax
93; X64-NEXT:    retq
94; X64-NEXT:  .LBB2_2: # %compare
95; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
96; X64-NEXT:    andl $15, %ecx
97; X64-NEXT:    movzbl -24(%rsp,%rcx), %eax
98; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
99; X64-NEXT:    subb -40(%rsp,%rcx), %al
100; X64-NEXT:    movzbl %al, %eax
101; X64-NEXT:    retq
102entry:
103  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ; An index of 16 (the lane count of <16 x i8>) skips the element compare.
104  %eq = icmp eq i32 %idx, 16
105  br i1 %eq, label %exit, label %compare
106
107compare:
  ; Pick the same lane from both inputs and subtract.
108  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
109  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
110  %sub = sub i8 %lhs_c, %rhs_c
111  br label %exit
112
113exit:
114  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
115  %result_ext = zext i8 %result to i32
116  ret i32 %result_ext
117}
118
; Flag-result form of the explicit-length compare with both vectors loaded from
; memory (align 1). The expected code keeps the %lhs load as a separate movdqu
; and uses the %rhs pointer directly as the memory operand of pcmpestri; the
; result is read from the flags with setae.
; On x86_64 the %rhs pointer is first copied from RDX to R8 because EDX is then
; overwritten with %rhs_len; on i686 ESI is saved and restored around its use as
; the %lhs pointer.
119define i1 @pcmpestri_mem_eq_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
120; X86-LABEL: pcmpestri_mem_eq_i8:
121; X86:       # %bb.0: # %entry
122; X86-NEXT:    pushl %esi
123; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
124; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
125; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
126; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
127; X86-NEXT:    movdqu (%esi), %xmm0
128; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
129; X86-NEXT:    setae %al
130; X86-NEXT:    popl %esi
131; X86-NEXT:    retl
132;
133; X64-LABEL: pcmpestri_mem_eq_i8:
134; X64:       # %bb.0: # %entry
135; X64-NEXT:    movq %rdx, %r8
136; X64-NEXT:    movl %esi, %eax
137; X64-NEXT:    movdqu (%rdi), %xmm0
138; X64-NEXT:    movl %ecx, %edx
139; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
140; X64-NEXT:    setae %al
141; X64-NEXT:    retq
142entry:
143  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
144  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
145  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ; Return true exactly when the intrinsic produced 0.
146  %result = icmp eq i32 %c, 0
147  ret i1 %result
148}
149
; Index-result form of the explicit-length compare with both vectors loaded from
; memory (align 1). The expected code loads %lhs with movdqu, uses the %rhs
; pointer as the memory operand of pcmpestri, and returns the index by copying
; ECX to EAX. Register shuffling before the instruction is the same as in
; @pcmpestri_mem_eq_i8.
150define i32 @pcmpestri_mem_idx_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
151; X86-LABEL: pcmpestri_mem_idx_i8:
152; X86:       # %bb.0: # %entry
153; X86-NEXT:    pushl %esi
154; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
155; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
156; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
157; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
158; X86-NEXT:    movdqu (%esi), %xmm0
159; X86-NEXT:    pcmpestri $24, (%ecx), %xmm0
160; X86-NEXT:    movl %ecx, %eax
161; X86-NEXT:    popl %esi
162; X86-NEXT:    retl
163;
164; X64-LABEL: pcmpestri_mem_idx_i8:
165; X64:       # %bb.0: # %entry
166; X64-NEXT:    movq %rdx, %r8
167; X64-NEXT:    movl %esi, %eax
168; X64-NEXT:    movdqu (%rdi), %xmm0
169; X64-NEXT:    movl %ecx, %edx
170; X64-NEXT:    pcmpestri $24, (%r8), %xmm0
171; X64-NEXT:    movl %ecx, %eax
172; X64-NEXT:    retq
173entry:
174  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
175  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
176  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
177  ret i32 %idx
178}
179
; Memory-operand variant of @pcmpestri_reg_diff_i8: both vectors are loaded
; (align 1), compared with pcmpestri128, and - unless the index is 16 - the
; bytes at that index are extracted from both and subtracted.
; In the expected code neither load is used as a memory operand of pcmpestri;
; both are brought into XMM registers with movdqu and later spilled for the
; variable-index extracts (presumably because the loaded values are reused by
; the extractelements - confirm if this changes).
180define i32 @pcmpestri_mem_diff_i8(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
181; X86-LABEL: pcmpestri_mem_diff_i8:
182; X86:       # %bb.0: # %entry
183; X86-NEXT:    pushl %ebp
184; X86-NEXT:    movl %esp, %ebp
185; X86-NEXT:    pushl %esi
186; X86-NEXT:    andl $-16, %esp
187; X86-NEXT:    subl $48, %esp
188; X86-NEXT:    movl 12(%ebp), %eax
189; X86-NEXT:    movl 20(%ebp), %edx
190; X86-NEXT:    movl 16(%ebp), %ecx
191; X86-NEXT:    movl 8(%ebp), %esi
192; X86-NEXT:    movdqu (%esi), %xmm1
193; X86-NEXT:    movdqu (%ecx), %xmm0
194; X86-NEXT:    pcmpestri $24, %xmm0, %xmm1
195; X86-NEXT:    cmpl $16, %ecx
196; X86-NEXT:    jne .LBB5_2
197; X86-NEXT:  # %bb.1:
198; X86-NEXT:    xorl %eax, %eax
199; X86-NEXT:    jmp .LBB5_3
200; X86-NEXT:  .LBB5_2: # %compare
201; X86-NEXT:    movdqa %xmm1, (%esp)
202; X86-NEXT:    andl $15, %ecx
203; X86-NEXT:    movzbl (%esp,%ecx), %eax
204; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
205; X86-NEXT:    subb 16(%esp,%ecx), %al
206; X86-NEXT:  .LBB5_3: # %exit
207; X86-NEXT:    movzbl %al, %eax
208; X86-NEXT:    leal -4(%ebp), %esp
209; X86-NEXT:    popl %esi
210; X86-NEXT:    popl %ebp
211; X86-NEXT:    retl
212;
213; X64-LABEL: pcmpestri_mem_diff_i8:
214; X64:       # %bb.0: # %entry
215; X64-NEXT:    movl %esi, %eax
216; X64-NEXT:    movdqu (%rdi), %xmm1
217; X64-NEXT:    movdqu (%rdx), %xmm0
218; X64-NEXT:    movl %ecx, %edx
219; X64-NEXT:    pcmpestri $24, %xmm0, %xmm1
220; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
221; X64-NEXT:    cmpl $16, %ecx
222; X64-NEXT:    jne .LBB5_2
223; X64-NEXT:  # %bb.1:
224; X64-NEXT:    xorl %eax, %eax
225; X64-NEXT:    movzbl %al, %eax
226; X64-NEXT:    retq
227; X64-NEXT:  .LBB5_2: # %compare
228; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
229; X64-NEXT:    andl $15, %ecx
230; X64-NEXT:    movzbl -24(%rsp,%rcx), %eax
231; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
232; X64-NEXT:    subb -40(%rsp,%rcx), %al
233; X64-NEXT:    movzbl %al, %eax
234; X64-NEXT:    retq
235entry:
236  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
237  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
238  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ; An index of 16 (the lane count of <16 x i8>) skips the element compare.
239  %eq = icmp eq i32 %idx, 16
240  br i1 %eq, label %exit, label %compare
241
242compare:
  ; Pick the same lane from both loaded vectors and subtract.
243  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
244  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
245  %sub = sub i8 %lhs_c, %rhs_c
246  br label %exit
247
248exit:
249  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
250  %result_ext = zext i8 %result to i32
251  ret i32 %result_ext
252}
253
; Same as @pcmpestri_reg_eq_i8 but the inputs are <8 x i16>; they are bitcast to
; <16 x i8> to match the intrinsic signature. The expected code is identical to
; the i8 version: the bitcasts produce no instructions.
; NOTE(review): the control byte here is 24, the value the i8 tests use, while
; the *_mem_*_i16 tests in this file pass 25 - confirm the difference is
; intentional.
254define i1 @pcmpestri_reg_eq_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
255; X86-LABEL: pcmpestri_reg_eq_i16:
256; X86:       # %bb.0: # %entry
257; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
258; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
259; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
260; X86-NEXT:    setae %al
261; X86-NEXT:    retl
262;
263; X64-LABEL: pcmpestri_reg_eq_i16:
264; X64:       # %bb.0: # %entry
265; X64-NEXT:    movl %esi, %edx
266; X64-NEXT:    movl %edi, %eax
267; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
268; X64-NEXT:    setae %al
269; X64-NEXT:    retq
270entry:
271  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
272  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
273  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
  ; Return true exactly when the intrinsic produced 0.
274  %result = icmp eq i32 %c, 0
275  ret i1 %result
276}
277
; Same as @pcmpestri_reg_idx_i8 but the inputs are <8 x i16>, bitcast to
; <16 x i8> for the intrinsic call. The expected code matches the i8 version:
; lengths in EAX/EDX, pcmpestri, then ECX copied to EAX.
; NOTE(review): the control byte here is 24, the value the i8 tests use, while
; the *_mem_*_i16 tests in this file pass 25 - confirm the difference is
; intentional.
278define i32 @pcmpestri_reg_idx_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
279; X86-LABEL: pcmpestri_reg_idx_i16:
280; X86:       # %bb.0: # %entry
281; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
282; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
283; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
284; X86-NEXT:    movl %ecx, %eax
285; X86-NEXT:    retl
286;
287; X64-LABEL: pcmpestri_reg_idx_i16:
288; X64:       # %bb.0: # %entry
289; X64-NEXT:    movl %esi, %edx
290; X64-NEXT:    movl %edi, %eax
291; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
292; X64-NEXT:    movl %ecx, %eax
293; X64-NEXT:    retq
294entry:
295  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
296  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
297  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 24)
298  ret i32 %idx
299}
300
; Word-element index test with both vectors passed as vector arguments. The
; <8 x i16> inputs are bitcast to <16 x i8> only to match the intrinsic
; signature. The control byte is 25: bit 0 set selects 16-bit elements in the
; Intel SDM imm8 encoding, so the returned index counts i16 lanes and 8 is the
; "no index" value, as in @pcmpestri_mem_diff_i16.
; This test previously passed 24 and compared against 16, which allowed an index
; of 8..15 to reach extractelement on an 8-lane vector (an out-of-range index).
; When the index is 8 the function returns 0; otherwise it returns the
; zero-extended i16 difference of the two selected lanes. The expected code
; spills both vectors and reads the lanes from the stack with the index doubled
; and masked.
; The immediate and compare constants in the assertions below were adjusted by
; hand to follow the IR change; rerun utils/update_llc_test_checks.py to confirm.
301define i32 @pcmpestri_reg_diff_i16(<8 x i16> %lhs, i32 %lhs_len, <8 x i16> %rhs, i32 %rhs_len) nounwind {
302; X86-LABEL: pcmpestri_reg_diff_i16:
303; X86:       # %bb.0: # %entry
304; X86-NEXT:    pushl %ebp
305; X86-NEXT:    movl %esp, %ebp
306; X86-NEXT:    andl $-16, %esp
307; X86-NEXT:    subl $48, %esp
308; X86-NEXT:    movl 8(%ebp), %eax
309; X86-NEXT:    movl 12(%ebp), %edx
310; X86-NEXT:    pcmpestri $25, %xmm1, %xmm0
311; X86-NEXT:    cmpl $8, %ecx
312; X86-NEXT:    jne .LBB8_2
313; X86-NEXT:  # %bb.1:
314; X86-NEXT:    xorl %eax, %eax
315; X86-NEXT:    jmp .LBB8_3
316; X86-NEXT:  .LBB8_2: # %compare
317; X86-NEXT:    movdqa %xmm0, (%esp)
318; X86-NEXT:    addl %ecx, %ecx
319; X86-NEXT:    andl $14, %ecx
320; X86-NEXT:    movzwl (%esp,%ecx), %eax
321; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
322; X86-NEXT:    subw 16(%esp,%ecx), %ax
323; X86-NEXT:  .LBB8_3: # %exit
324; X86-NEXT:    movzwl %ax, %eax
325; X86-NEXT:    movl %ebp, %esp
326; X86-NEXT:    popl %ebp
327; X86-NEXT:    retl
328;
329; X64-LABEL: pcmpestri_reg_diff_i16:
330; X64:       # %bb.0: # %entry
331; X64-NEXT:    movl %esi, %edx
332; X64-NEXT:    movl %edi, %eax
333; X64-NEXT:    pcmpestri $25, %xmm1, %xmm0
334; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
335; X64-NEXT:    cmpl $8, %ecx
336; X64-NEXT:    jne .LBB8_2
337; X64-NEXT:  # %bb.1:
338; X64-NEXT:    xorl %eax, %eax
339; X64-NEXT:    movzwl %ax, %eax
340; X64-NEXT:    retq
341; X64-NEXT:  .LBB8_2: # %compare
342; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
343; X64-NEXT:    andl $7, %ecx
344; X64-NEXT:    addl %ecx, %ecx
345; X64-NEXT:    movzwl -24(%rsp,%rcx), %eax
346; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
347; X64-NEXT:    subw -40(%rsp,%rcx), %ax
348; X64-NEXT:    movzwl %ax, %eax
349; X64-NEXT:    retq
350entry:
351  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
352  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  ; Control byte 25 = word elements, so %idx is an i16 lane number in 0..8.
353  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  ; 8 is the lane count of <8 x i16>; it skips the element compare.
354  %eq = icmp eq i32 %idx, 8
355  br i1 %eq, label %exit, label %compare
356
357compare:
  ; %idx is below 8 on this path, so both extracts are in range.
358  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
359  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
360  %sub = sub i16 %lhs_c, %rhs_c
361  br label %exit
362
363exit:
364  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
365  %result_ext = zext i16 %result to i32
366  ret i32 %result_ext
367}
368
; Flag-result form of the explicit-length compare on <8 x i16> vectors loaded
; from memory (align 1) and bitcast to <16 x i8> for the intrinsic call. The
; control byte is 25 here (the i8 tests use 24) and appears unchanged as the
; pcmpestri immediate. As in the i8 memory test, the expected code loads %lhs
; with movdqu and uses the %rhs pointer as the instruction's memory operand.
369define i1 @pcmpestri_mem_eq_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
370; X86-LABEL: pcmpestri_mem_eq_i16:
371; X86:       # %bb.0: # %entry
372; X86-NEXT:    pushl %esi
373; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
374; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
375; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
376; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
377; X86-NEXT:    movdqu (%esi), %xmm0
378; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
379; X86-NEXT:    setae %al
380; X86-NEXT:    popl %esi
381; X86-NEXT:    retl
382;
383; X64-LABEL: pcmpestri_mem_eq_i16:
384; X64:       # %bb.0: # %entry
385; X64-NEXT:    movq %rdx, %r8
386; X64-NEXT:    movl %esi, %eax
387; X64-NEXT:    movdqu (%rdi), %xmm0
388; X64-NEXT:    movl %ecx, %edx
389; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
390; X64-NEXT:    setae %al
391; X64-NEXT:    retq
392entry:
393  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
394  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
395  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
396  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
397  %c = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  ; Return true exactly when the intrinsic produced 0.
398  %result = icmp eq i32 %c, 0
399  ret i1 %result
400}
401
; Index-result form of the explicit-length compare on <8 x i16> vectors loaded
; from memory (align 1) and bitcast to <16 x i8> for the intrinsic call, with
; control byte 25. The expected code loads %lhs with movdqu, uses the %rhs
; pointer as the pcmpestri memory operand and returns ECX via EAX.
402define i32 @pcmpestri_mem_idx_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
403; X86-LABEL: pcmpestri_mem_idx_i16:
404; X86:       # %bb.0: # %entry
405; X86-NEXT:    pushl %esi
406; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
407; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
408; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
409; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
410; X86-NEXT:    movdqu (%esi), %xmm0
411; X86-NEXT:    pcmpestri $25, (%ecx), %xmm0
412; X86-NEXT:    movl %ecx, %eax
413; X86-NEXT:    popl %esi
414; X86-NEXT:    retl
415;
416; X64-LABEL: pcmpestri_mem_idx_i16:
417; X64:       # %bb.0: # %entry
418; X64-NEXT:    movq %rdx, %r8
419; X64-NEXT:    movl %esi, %eax
420; X64-NEXT:    movdqu (%rdi), %xmm0
421; X64-NEXT:    movl %ecx, %edx
422; X64-NEXT:    pcmpestri $25, (%r8), %xmm0
423; X64-NEXT:    movl %ecx, %eax
424; X64-NEXT:    retq
425entry:
426  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
427  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
428  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
429  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
430  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
431  ret i32 %idx
432}
433
; Word-element index test with both <8 x i16> vectors loaded from memory
; (align 1). The vectors are bitcast to <16 x i8> for the intrinsic call, which
; uses control byte 25. When the returned index equals 8 the function returns
; 0; otherwise it returns the zero-extended i16 difference of the lanes selected
; by the index.
; The expected code loads both vectors with movdqu (neither is used as a memory
; operand of pcmpestri), spills them, and reads the selected lanes from the
; stack using the index doubled and masked (i686: add then and 14; x86_64: and 7
; then add).
434define i32 @pcmpestri_mem_diff_i16(ptr %lhs_ptr, i32 %lhs_len, ptr %rhs_ptr, i32 %rhs_len) nounwind {
435; X86-LABEL: pcmpestri_mem_diff_i16:
436; X86:       # %bb.0: # %entry
437; X86-NEXT:    pushl %ebp
438; X86-NEXT:    movl %esp, %ebp
439; X86-NEXT:    pushl %esi
440; X86-NEXT:    andl $-16, %esp
441; X86-NEXT:    subl $48, %esp
442; X86-NEXT:    movl 12(%ebp), %eax
443; X86-NEXT:    movl 20(%ebp), %edx
444; X86-NEXT:    movl 16(%ebp), %ecx
445; X86-NEXT:    movl 8(%ebp), %esi
446; X86-NEXT:    movdqu (%esi), %xmm1
447; X86-NEXT:    movdqu (%ecx), %xmm0
448; X86-NEXT:    pcmpestri $25, %xmm0, %xmm1
449; X86-NEXT:    cmpl $8, %ecx
450; X86-NEXT:    jne .LBB11_2
451; X86-NEXT:  # %bb.1:
452; X86-NEXT:    xorl %eax, %eax
453; X86-NEXT:    jmp .LBB11_3
454; X86-NEXT:  .LBB11_2: # %compare
455; X86-NEXT:    movdqa %xmm1, (%esp)
456; X86-NEXT:    addl %ecx, %ecx
457; X86-NEXT:    andl $14, %ecx
458; X86-NEXT:    movzwl (%esp,%ecx), %eax
459; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
460; X86-NEXT:    subw 16(%esp,%ecx), %ax
461; X86-NEXT:  .LBB11_3: # %exit
462; X86-NEXT:    movzwl %ax, %eax
463; X86-NEXT:    leal -4(%ebp), %esp
464; X86-NEXT:    popl %esi
465; X86-NEXT:    popl %ebp
466; X86-NEXT:    retl
467;
468; X64-LABEL: pcmpestri_mem_diff_i16:
469; X64:       # %bb.0: # %entry
470; X64-NEXT:    movl %esi, %eax
471; X64-NEXT:    movdqu (%rdi), %xmm1
472; X64-NEXT:    movdqu (%rdx), %xmm0
473; X64-NEXT:    movl %ecx, %edx
474; X64-NEXT:    pcmpestri $25, %xmm0, %xmm1
475; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
476; X64-NEXT:    cmpl $8, %ecx
477; X64-NEXT:    jne .LBB11_2
478; X64-NEXT:  # %bb.1:
479; X64-NEXT:    xorl %eax, %eax
480; X64-NEXT:    movzwl %ax, %eax
481; X64-NEXT:    retq
482; X64-NEXT:  .LBB11_2: # %compare
483; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
484; X64-NEXT:    andl $7, %ecx
485; X64-NEXT:    addl %ecx, %ecx
486; X64-NEXT:    movzwl -24(%rsp,%rcx), %eax
487; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
488; X64-NEXT:    subw -40(%rsp,%rcx), %ax
489; X64-NEXT:    movzwl %ax, %eax
490; X64-NEXT:    retq
491entry:
492  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
493  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
494  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
495  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
496  %idx = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs_cast, i32 %lhs_len, <16 x i8> %rhs_cast, i32 %rhs_len, i8 25)
  ; 8 is the lane count of <8 x i16>; it skips the element compare.
497  %eq = icmp eq i32 %idx, 8
498  br i1 %eq, label %exit, label %compare
499
500compare:
  ; The extracts use the original <8 x i16> values, not the bitcast ones.
501  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
502  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
503  %sub = sub i16 %lhs_c, %rhs_c
504  br label %exit
505
506exit:
507  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
508  %result_ext = zext i16 %result to i32
509  ret i32 %result_ext
510}
511
; Flag-result form of the implicit-length compare (no length operands) with both
; vectors passed as vector arguments. The IR tests the pcmpistric128 result
; against 0; the expected code is a single pcmpistri with immediate 24 followed
; by setae, identical on both targets apart from the return instruction.
512define i1 @pcmpistri_reg_eq_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
513; X86-LABEL: pcmpistri_reg_eq_i8:
514; X86:       # %bb.0: # %entry
515; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
516; X86-NEXT:    setae %al
517; X86-NEXT:    retl
518;
519; X64-LABEL: pcmpistri_reg_eq_i8:
520; X64:       # %bb.0: # %entry
521; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
522; X64-NEXT:    setae %al
523; X64-NEXT:    retq
524entry:
525  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ; Return true exactly when the intrinsic produced 0.
526  %result = icmp eq i32 %c, 0
527  ret i1 %result
528}
529
; Index-result form of the implicit-length compare with both vectors passed as
; vector arguments. The i32 returned by pcmpistri128 is the function result; the
; expected code is pcmpistri followed by a copy of ECX into EAX.
530define i32 @pcmpistri_reg_idx_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
531; X86-LABEL: pcmpistri_reg_idx_i8:
532; X86:       # %bb.0: # %entry
533; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
534; X86-NEXT:    movl %ecx, %eax
535; X86-NEXT:    retl
536;
537; X64-LABEL: pcmpistri_reg_idx_i8:
538; X64:       # %bb.0: # %entry
539; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
540; X64-NEXT:    movl %ecx, %eax
541; X64-NEXT:    retq
542entry:
543  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
544  ret i32 %idx
545}
546
; Implicit-length counterpart of @pcmpestri_reg_diff_i8: run pcmpistri128 on the
; two register vectors and, unless the index is 16, return the zero-extended i8
; difference of the bytes at that index (0 otherwise).
; In the expected i686 code the frame setup (push EBP, realign, reserve 48
; bytes) appears only inside the %compare block, so the index == 16 path returns
; without touching the stack. The x86_64 code uses negative RSP offsets and has
; no frame setup at all.
547define i32 @pcmpistri_reg_diff_i8(<16 x i8> %lhs, <16 x i8> %rhs) nounwind {
548; X86-LABEL: pcmpistri_reg_diff_i8:
549; X86:       # %bb.0: # %entry
550; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
551; X86-NEXT:    cmpl $16, %ecx
552; X86-NEXT:    jne .LBB14_2
553; X86-NEXT:  # %bb.1:
554; X86-NEXT:    xorl %eax, %eax
555; X86-NEXT:    movzbl %al, %eax
556; X86-NEXT:    retl
557; X86-NEXT:  .LBB14_2: # %compare
558; X86-NEXT:    pushl %ebp
559; X86-NEXT:    movl %esp, %ebp
560; X86-NEXT:    andl $-16, %esp
561; X86-NEXT:    subl $48, %esp
562; X86-NEXT:    movdqa %xmm0, (%esp)
563; X86-NEXT:    andl $15, %ecx
564; X86-NEXT:    movzbl (%esp,%ecx), %eax
565; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
566; X86-NEXT:    subb 16(%esp,%ecx), %al
567; X86-NEXT:    movl %ebp, %esp
568; X86-NEXT:    popl %ebp
569; X86-NEXT:    movzbl %al, %eax
570; X86-NEXT:    retl
571;
572; X64-LABEL: pcmpistri_reg_diff_i8:
573; X64:       # %bb.0: # %entry
574; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
575; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
576; X64-NEXT:    cmpl $16, %ecx
577; X64-NEXT:    jne .LBB14_2
578; X64-NEXT:  # %bb.1:
579; X64-NEXT:    xorl %eax, %eax
580; X64-NEXT:    movzbl %al, %eax
581; X64-NEXT:    retq
582; X64-NEXT:  .LBB14_2: # %compare
583; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
584; X64-NEXT:    andl $15, %ecx
585; X64-NEXT:    movzbl -24(%rsp,%rcx), %eax
586; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
587; X64-NEXT:    subb -40(%rsp,%rcx), %al
588; X64-NEXT:    movzbl %al, %eax
589; X64-NEXT:    retq
590entry:
591  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ; An index of 16 (the lane count of <16 x i8>) skips the element compare.
592  %eq = icmp eq i32 %idx, 16
593  br i1 %eq, label %exit, label %compare
594
595compare:
  ; Pick the same lane from both inputs and subtract.
596  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
597  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
598  %sub = sub i8 %lhs_c, %rhs_c
599  br label %exit
600
601exit:
602  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
603  %result_ext = zext i8 %result to i32
604  ret i32 %result_ext
605}
606
; Flag-result form of the implicit-length compare with both vectors loaded from
; memory (align 1). The expected code loads %lhs with movdqu and uses the %rhs
; pointer directly as the pcmpistri memory operand, then reads the result from
; the flags with setae.
607define i1 @pcmpistri_mem_eq_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
608; X86-LABEL: pcmpistri_mem_eq_i8:
609; X86:       # %bb.0: # %entry
610; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
611; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
612; X86-NEXT:    movdqu (%ecx), %xmm0
613; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
614; X86-NEXT:    setae %al
615; X86-NEXT:    retl
616;
617; X64-LABEL: pcmpistri_mem_eq_i8:
618; X64:       # %bb.0: # %entry
619; X64-NEXT:    movdqu (%rdi), %xmm0
620; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
621; X64-NEXT:    setae %al
622; X64-NEXT:    retq
623entry:
624  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
625  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
626  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ; Return true exactly when the intrinsic produced 0.
627  %result = icmp eq i32 %c, 0
628  ret i1 %result
629}
630
; Index-result form of the implicit-length compare with both vectors loaded from
; memory (align 1). The expected code loads %lhs with movdqu, uses the %rhs
; pointer as the pcmpistri memory operand and returns the index by copying ECX
; to EAX.
631define i32 @pcmpistri_mem_idx_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
632; X86-LABEL: pcmpistri_mem_idx_i8:
633; X86:       # %bb.0: # %entry
634; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
635; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
636; X86-NEXT:    movdqu (%ecx), %xmm0
637; X86-NEXT:    pcmpistri $24, (%eax), %xmm0
638; X86-NEXT:    movl %ecx, %eax
639; X86-NEXT:    retl
640;
641; X64-LABEL: pcmpistri_mem_idx_i8:
642; X64:       # %bb.0: # %entry
643; X64-NEXT:    movdqu (%rdi), %xmm0
644; X64-NEXT:    pcmpistri $24, (%rsi), %xmm0
645; X64-NEXT:    movl %ecx, %eax
646; X64-NEXT:    retq
647entry:
648  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
649  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
650  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
651  ret i32 %idx
652}
653
; Memory-operand variant of @pcmpistri_reg_diff_i8: both vectors are loaded
; (align 1), compared with pcmpistri128, and - unless the index is 16 - the
; bytes at that index are extracted from both and subtracted; the result is
; zero-extended to i32.
; In the expected code both loads are separate movdqu instructions and
; pcmpistri takes two register operands; the vectors are then spilled so the
; selected bytes can be read from the stack with the index masked to 15.
; Unlike the register variant, the i686 frame setup is in the entry block here.
654define i32 @pcmpistri_mem_diff_i8(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
655; X86-LABEL: pcmpistri_mem_diff_i8:
656; X86:       # %bb.0: # %entry
657; X86-NEXT:    pushl %ebp
658; X86-NEXT:    movl %esp, %ebp
659; X86-NEXT:    andl $-16, %esp
660; X86-NEXT:    subl $48, %esp
661; X86-NEXT:    movl 12(%ebp), %eax
662; X86-NEXT:    movl 8(%ebp), %ecx
663; X86-NEXT:    movdqu (%ecx), %xmm1
664; X86-NEXT:    movdqu (%eax), %xmm0
665; X86-NEXT:    pcmpistri $24, %xmm0, %xmm1
666; X86-NEXT:    cmpl $16, %ecx
667; X86-NEXT:    jne .LBB17_2
668; X86-NEXT:  # %bb.1:
669; X86-NEXT:    xorl %eax, %eax
670; X86-NEXT:    jmp .LBB17_3
671; X86-NEXT:  .LBB17_2: # %compare
672; X86-NEXT:    movdqa %xmm1, (%esp)
673; X86-NEXT:    andl $15, %ecx
674; X86-NEXT:    movzbl (%esp,%ecx), %eax
675; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
676; X86-NEXT:    subb 16(%esp,%ecx), %al
677; X86-NEXT:  .LBB17_3: # %exit
678; X86-NEXT:    movzbl %al, %eax
679; X86-NEXT:    movl %ebp, %esp
680; X86-NEXT:    popl %ebp
681; X86-NEXT:    retl
682;
683; X64-LABEL: pcmpistri_mem_diff_i8:
684; X64:       # %bb.0: # %entry
685; X64-NEXT:    movdqu (%rdi), %xmm1
686; X64-NEXT:    movdqu (%rsi), %xmm0
687; X64-NEXT:    pcmpistri $24, %xmm0, %xmm1
688; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
689; X64-NEXT:    cmpl $16, %ecx
690; X64-NEXT:    jne .LBB17_2
691; X64-NEXT:  # %bb.1:
692; X64-NEXT:    xorl %eax, %eax
693; X64-NEXT:    movzbl %al, %eax
694; X64-NEXT:    retq
695; X64-NEXT:  .LBB17_2: # %compare
696; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
697; X64-NEXT:    andl $15, %ecx
698; X64-NEXT:    movzbl -24(%rsp,%rcx), %eax
699; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
700; X64-NEXT:    subb -40(%rsp,%rcx), %al
701; X64-NEXT:    movzbl %al, %eax
702; X64-NEXT:    retq
703entry:
704  %lhs = load <16 x i8>, ptr %lhs_ptr, align 1
705  %rhs = load <16 x i8>, ptr %rhs_ptr, align 1
706  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  ; An index of 16 (the lane count of <16 x i8>) skips the element compare.
707  %eq = icmp eq i32 %idx, 16
708  br i1 %eq, label %exit, label %compare
709
710compare:
  ; Pick the same lane from both loaded vectors and subtract.
711  %lhs_c = extractelement <16 x i8> %lhs, i32 %idx
712  %rhs_c = extractelement <16 x i8> %rhs, i32 %idx
713  %sub = sub i8 %lhs_c, %rhs_c
714  br label %exit
715
716exit:
717  %result = phi i8 [ 0, %entry ], [ %sub, %compare ]
718  %result_ext = zext i8 %result to i32
719  ret i32 %result_ext
720}
721
; Same as @pcmpistri_reg_eq_i8 but the inputs are <8 x i16>, bitcast to
; <16 x i8> to match the intrinsic signature. The expected code is the same
; pcmpistri + setae pair; the bitcasts produce no instructions.
; NOTE(review): the control byte here is 24, the value the i8 tests use, while
; the *_mem_*_i16 tests in this file pass 25 - confirm the difference is
; intentional.
722define i1 @pcmpistri_reg_eq_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
723; X86-LABEL: pcmpistri_reg_eq_i16:
724; X86:       # %bb.0: # %entry
725; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
726; X86-NEXT:    setae %al
727; X86-NEXT:    retl
728;
729; X64-LABEL: pcmpistri_reg_eq_i16:
730; X64:       # %bb.0: # %entry
731; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
732; X64-NEXT:    setae %al
733; X64-NEXT:    retq
734entry:
735  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
736  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
737  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
  ; Return true exactly when the intrinsic produced 0.
738  %result = icmp eq i32 %c, 0
739  ret i1 %result
740}
741
; Same as @pcmpistri_reg_idx_i8 but the inputs are <8 x i16>, bitcast to
; <16 x i8> for the intrinsic call. The expected code is pcmpistri followed by a
; copy of ECX into EAX.
; NOTE(review): the control byte here is 24, the value the i8 tests use, while
; the *_mem_*_i16 tests in this file pass 25 - confirm the difference is
; intentional.
742define i32 @pcmpistri_reg_idx_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
743; X86-LABEL: pcmpistri_reg_idx_i16:
744; X86:       # %bb.0: # %entry
745; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
746; X86-NEXT:    movl %ecx, %eax
747; X86-NEXT:    retl
748;
749; X64-LABEL: pcmpistri_reg_idx_i16:
750; X64:       # %bb.0: # %entry
751; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
752; X64-NEXT:    movl %ecx, %eax
753; X64-NEXT:    retq
754entry:
755  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
756  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
757  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 24)
758  ret i32 %idx
759}
760
; Word-element index test for the implicit-length compare with both vectors
; passed as vector arguments. The <8 x i16> inputs are bitcast to <16 x i8> only
; to match the intrinsic signature. The control byte is 25: bit 0 set selects
; 16-bit elements in the Intel SDM imm8 encoding, so the returned index counts
; i16 lanes and 8 is the "no index" value, as in @pcmpistri_mem_diff_i16.
; This test previously passed 24 and compared against 16, which allowed an index
; of 8..15 to reach extractelement on an 8-lane vector (an out-of-range index).
; When the index is 8 the function returns 0; otherwise it returns the
; zero-extended i16 difference of the two selected lanes. In the expected i686
; code the frame setup appears only inside the %compare block.
; The immediate and compare constants in the assertions below were adjusted by
; hand to follow the IR change; rerun utils/update_llc_test_checks.py to confirm.
761define i32 @pcmpistri_reg_diff_i16(<8 x i16> %lhs, <8 x i16> %rhs) nounwind {
762; X86-LABEL: pcmpistri_reg_diff_i16:
763; X86:       # %bb.0: # %entry
764; X86-NEXT:    pcmpistri $25, %xmm1, %xmm0
765; X86-NEXT:    cmpl $8, %ecx
766; X86-NEXT:    jne .LBB20_2
767; X86-NEXT:  # %bb.1:
768; X86-NEXT:    xorl %eax, %eax
769; X86-NEXT:    movzwl %ax, %eax
770; X86-NEXT:    retl
771; X86-NEXT:  .LBB20_2: # %compare
772; X86-NEXT:    pushl %ebp
773; X86-NEXT:    movl %esp, %ebp
774; X86-NEXT:    andl $-16, %esp
775; X86-NEXT:    subl $48, %esp
776; X86-NEXT:    movdqa %xmm0, (%esp)
777; X86-NEXT:    addl %ecx, %ecx
778; X86-NEXT:    andl $14, %ecx
779; X86-NEXT:    movzwl (%esp,%ecx), %eax
780; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
781; X86-NEXT:    subw 16(%esp,%ecx), %ax
782; X86-NEXT:    movl %ebp, %esp
783; X86-NEXT:    popl %ebp
784; X86-NEXT:    movzwl %ax, %eax
785; X86-NEXT:    retl
786;
787; X64-LABEL: pcmpistri_reg_diff_i16:
788; X64:       # %bb.0: # %entry
789; X64-NEXT:    pcmpistri $25, %xmm1, %xmm0
790; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
791; X64-NEXT:    cmpl $8, %ecx
792; X64-NEXT:    jne .LBB20_2
793; X64-NEXT:  # %bb.1:
794; X64-NEXT:    xorl %eax, %eax
795; X64-NEXT:    movzwl %ax, %eax
796; X64-NEXT:    retq
797; X64-NEXT:  .LBB20_2: # %compare
798; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
799; X64-NEXT:    andl $7, %ecx
800; X64-NEXT:    addl %ecx, %ecx
801; X64-NEXT:    movzwl -24(%rsp,%rcx), %eax
802; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
803; X64-NEXT:    subw -40(%rsp,%rcx), %ax
804; X64-NEXT:    movzwl %ax, %eax
805; X64-NEXT:    retq
806entry:
807  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
808  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  ; Control byte 25 = word elements, so %idx is an i16 lane number in 0..8.
809  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ; 8 is the lane count of <8 x i16>; it skips the element compare.
810  %eq = icmp eq i32 %idx, 8
811  br i1 %eq, label %exit, label %compare
812
813compare:
  ; %idx is below 8 on this path, so both extracts are in range.
814  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
815  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
816  %sub = sub i16 %lhs_c, %rhs_c
817  br label %exit
818
819exit:
820  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
821  %result_ext = zext i16 %result to i32
822  ret i32 %result_ext
823}
824
; Flag-result form of the implicit-length compare on <8 x i16> vectors loaded
; from memory (align 1) and bitcast to <16 x i8> for the intrinsic call. The
; control byte is 25 here (the i8 tests use 24) and appears unchanged as the
; pcmpistri immediate. The expected code loads %lhs with movdqu, uses the %rhs
; pointer as the instruction's memory operand and reads the result with setae.
825define i1 @pcmpistri_mem_eq_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
826; X86-LABEL: pcmpistri_mem_eq_i16:
827; X86:       # %bb.0: # %entry
828; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
829; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
830; X86-NEXT:    movdqu (%ecx), %xmm0
831; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
832; X86-NEXT:    setae %al
833; X86-NEXT:    retl
834;
835; X64-LABEL: pcmpistri_mem_eq_i16:
836; X64:       # %bb.0: # %entry
837; X64-NEXT:    movdqu (%rdi), %xmm0
838; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
839; X64-NEXT:    setae %al
840; X64-NEXT:    retq
841entry:
842  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
843  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
844  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
845  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
846  %c = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ; Return true exactly when the intrinsic produced 0.
847  %result = icmp eq i32 %c, 0
848  ret i1 %result
849}
850
; Index-result form of the implicit-length compare on <8 x i16> vectors loaded
; from memory (align 1) and bitcast to <16 x i8> for the intrinsic call, with
; control byte 25. The expected code loads %lhs with movdqu, uses the %rhs
; pointer as the pcmpistri memory operand and returns ECX via EAX.
851define i32 @pcmpistri_mem_idx_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
852; X86-LABEL: pcmpistri_mem_idx_i16:
853; X86:       # %bb.0: # %entry
854; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
855; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
856; X86-NEXT:    movdqu (%ecx), %xmm0
857; X86-NEXT:    pcmpistri $25, (%eax), %xmm0
858; X86-NEXT:    movl %ecx, %eax
859; X86-NEXT:    retl
860;
861; X64-LABEL: pcmpistri_mem_idx_i16:
862; X64:       # %bb.0: # %entry
863; X64-NEXT:    movdqu (%rdi), %xmm0
864; X64-NEXT:    pcmpistri $25, (%rsi), %xmm0
865; X64-NEXT:    movl %ecx, %eax
866; X64-NEXT:    retq
867entry:
868  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
869  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
870  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
871  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
872  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
873  ret i32 %idx
874}
875
; strcmp-style use of the index result on 16-bit elements loaded from memory.
;
; The index returned by pcmpistri is compared with 8 (the number of i16
; lanes). When it differs, the same index selects one lane from each of the
; ORIGINAL <8 x i16> vectors and the two lanes are subtracted; otherwise the
; result is 0. The i16 result is zero-extended to i32.
;
; Because both vectors are needed again after the compare, the expected code
; does not fold either load into pcmpistri. The variable-index extracts are
; expected to go through stack slots: the i686 output sets up a 16-byte
; aligned frame, while the x86-64 output addresses below %rsp without
; adjusting it.
define i32 @pcmpistri_mem_diff_i16(ptr %lhs_ptr, ptr %rhs_ptr) nounwind {
; X86-LABEL: pcmpistri_mem_diff_i16:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movl 12(%ebp), %eax
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm1
; X86-NEXT:    movdqu (%eax), %xmm0
; X86-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X86-NEXT:    cmpl $8, %ecx
; X86-NEXT:    jne .LBB23_2
; X86-NEXT:  # %bb.1:
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    jmp .LBB23_3
; X86-NEXT:  .LBB23_2: # %compare
; X86-NEXT:    movdqa %xmm1, (%esp)
; X86-NEXT:    addl %ecx, %ecx
; X86-NEXT:    andl $14, %ecx
; X86-NEXT:    movzwl (%esp,%ecx), %eax
; X86-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    subw 16(%esp,%ecx), %ax
; X86-NEXT:  .LBB23_3: # %exit
; X86-NEXT:    movzwl %ax, %eax
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistri_mem_diff_i16:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqu (%rdi), %xmm1
; X64-NEXT:    movdqu (%rsi), %xmm0
; X64-NEXT:    pcmpistri $25, %xmm0, %xmm1
; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
; X64-NEXT:    cmpl $8, %ecx
; X64-NEXT:    jne .LBB23_2
; X64-NEXT:  # %bb.1:
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
; X64-NEXT:  .LBB23_2: # %compare
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    andl $7, %ecx
; X64-NEXT:    addl %ecx, %ecx
; X64-NEXT:    movzwl -24(%rsp,%rcx), %eax
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    subw -40(%rsp,%rcx), %ax
; X64-NEXT:    movzwl %ax, %eax
; X64-NEXT:    retq
entry:
  %lhs = load <8 x i16>, ptr %lhs_ptr, align 1
  %rhs = load <8 x i16>, ptr %rhs_ptr, align 1
  %lhs_cast = bitcast <8 x i16> %lhs to <16 x i8>
  %rhs_cast = bitcast <8 x i16> %rhs to <16 x i8>
  %idx = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs_cast, <16 x i8> %rhs_cast, i8 25)
  ; Per the Intel SDM the index equals the element count (8 for words) when
  ; no element is selected, so 8 takes the "no difference" path.
  %eq = icmp eq i32 %idx, 8
  br i1 %eq, label %exit, label %compare

compare:
  ; Dynamic-index extracts from the un-bitcast vectors.
  %lhs_c = extractelement <8 x i16> %lhs, i32 %idx
  %rhs_c = extractelement <8 x i16> %rhs, i32 %idx
  %sub = sub i16 %lhs_c, %rhs_c
  br label %exit

exit:
  %result = phi i16 [ 0, %entry ], [ %sub, %compare ]
  %result_ext = zext i16 %result to i32
  ret i32 %result_ext
}

; Explicit-length compare where both the carry flag and the index are wanted.
;
; The flag intrinsic and the index intrinsic are called with identical
; operands and control byte 24 (per the Intel SDM: unsigned bytes /
; equal-each / negative polarity). The expected code contains a single
; pcmpestri that serves both: the index is taken from %ecx and the flag is
; materialised with setb into a register zeroed beforehand. The lengths are
; expected in %eax and %edx. The index is stored through %iptr and the flag
; through %fptr.
define void @pcmpestr_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %iptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpestr_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ecx, (%edi)
; X86-NEXT:    movl %ebx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movl %ecx, (%r9)
; X64-NEXT:    movl %esi, (%r8)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store i32 %index, ptr %iptr
  store i32 %flag, ptr %fptr
  ret void
}

; Explicit-length compare where both the carry flag and the mask are wanted.
;
; The flag intrinsic and the mask intrinsic share operands and control byte
; 24. The expected code contains a single pcmpestrm: the mask is stored from
; %xmm0 through %mptr, and the flag is materialised with setb into a
; pre-zeroed register and stored through %fptr.
define void @pcmpestr_mask_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ebx, (%ecx)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movdqa %xmm0, (%r8)
; X64-NEXT:    movl %esi, (%rcx)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %flag, ptr %fptr
  ret void
}

; Explicit-length compare where both the mask and the index are wanted.
;
; The two results cannot come from one instruction, so the expected code
; contains both pcmpestrm and pcmpestri with the same operands and control
; byte 24. Since the mask is produced in %xmm0, the expected code first
; copies the left-hand operand to %xmm2 and runs pcmpestri on that copy.
; The mask is stored through %mptr and the index (from %ecx) through %iptr.
define void @pcmpestr_mask_index(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr) nounwind {
; X86-LABEL: pcmpestr_mask_index:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X86-NEXT:    movdqa %xmm0, (%edi)
; X86-NEXT:    movl %ecx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_index:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r8
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X64-NEXT:    movdqa %xmm0, (%r9)
; X64-NEXT:    movl %ecx, (%r8)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %index, ptr %iptr
  ret void
}

; Explicit-length compare where the mask, the index and the carry flag are
; all wanted from the same operands and control byte 24.
;
; The expected code contains exactly two string-compare instructions:
; pcmpestrm for the mask, then pcmpestri on a copy of the left-hand operand
; for the index. The flag is not given an instruction of its own; it is read
; with setb right after pcmpestri into a pre-zeroed register. The stores go
; through %mptr, %iptr and %fptr in that order.
define void @pcmpestr_mask_index_flag(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpestr_mask_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%ebp)
; X86-NEXT:    movl %ecx, (%edi)
; X86-NEXT:    movl %ebx, (%esi)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestr_mask_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %r9
; X64-NEXT:    movq %rdx, %r10
; X64-NEXT:    movl %esi, %edx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpestrm $24, %xmm1, %xmm0
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm2
; X64-NEXT:    setb %sil
; X64-NEXT:    movdqa %xmm0, (%r10)
; X64-NEXT:    movl %ecx, (%r9)
; X64-NEXT:    movl %esi, (%r8)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %index, ptr %iptr
  store i32 %flag, ptr %fptr
  ret void
}

; Implicit-length compare where both the carry flag and the index are wanted.
;
; The flag intrinsic and the index intrinsic share operands and control byte
; 24. The expected code contains a single pcmpistri: the index is stored from
; %ecx through %iptr, and the flag is materialised with setb into a
; pre-zeroed register and stored through %fptr.
define void @pcmpistr_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %iptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpistr_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    setb %al
; X86-NEXT:    movl %ecx, (%esi)
; X86-NEXT:    movl %eax, (%edx)
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    setb %al
; X64-NEXT:    movl %ecx, (%rdi)
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store i32 %index, ptr %iptr
  store i32 %flag, ptr %fptr
  ret void
}

; Implicit-length compare where both the carry flag and the mask are wanted.
;
; The flag intrinsic and the mask intrinsic share operands and control byte
; 24. The expected code contains a single pcmpistrm: the mask is stored from
; %xmm0 through %mptr, and the flag is materialised with setb into a
; pre-zeroed register and stored through %fptr.
define void @pcmpistr_mask_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    xorl %eax, %eax
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    setb %al
; X86-NEXT:    movdqa %xmm0, (%edx)
; X86-NEXT:    movl %eax, (%ecx)
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    setb %al
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %eax, (%rsi)
; X64-NEXT:    retq
entry:
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %flag, ptr %fptr
  ret void
}

; Implicit-length compare where both the mask and the index are wanted.
;
; Both pcmpistri and pcmpistrm are expected, with the same operands and
; control byte 24. In the expected code pcmpistri is emitted first; the
; pcmpistrm that follows it produces the mask in %xmm0, and no copy of the
; left-hand operand appears. The mask is stored through %mptr and the index
; (from %ecx) through %iptr.
define void @pcmpistr_mask_index(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr) nounwind {
; X86-LABEL: pcmpistr_mask_index:
; X86:       # %bb.0: # %entry
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    movdqa %xmm0, (%edx)
; X86-NEXT:    movl %ecx, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm0
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %ecx, (%rsi)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %index, ptr %iptr
  ret void
}

; Implicit-length compare where the mask, the index and the carry flag are
; all wanted from the same operands and control byte 24.
;
; The expected code contains exactly two string-compare instructions:
; pcmpistrm for the mask, then pcmpistri on a copy of the left-hand operand
; (held in %xmm2) for the index. The flag is read with setb right after
; pcmpistri into a pre-zeroed register. The stores go through %mptr, %iptr
; and %fptr in that order.
define void @pcmpistr_mask_index_flag(<16 x i8> %lhs, <16 x i8> %rhs, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_index_flag:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm2
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpistri $24, %xmm1, %xmm2
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:    movl %ebx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index_flag:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    pcmpistrm $24, %xmm1, %xmm0
; X64-NEXT:    xorl %eax, %eax
; X64-NEXT:    pcmpistri $24, %xmm1, %xmm2
; X64-NEXT:    setb %al
; X64-NEXT:    movdqa %xmm0, (%rdi)
; X64-NEXT:    movl %ecx, (%rsi)
; X64-NEXT:    movl %eax, (%rdx)
; X64-NEXT:    retq
entry:
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %index, ptr %iptr
  store i32 %flag, ptr %fptr
  ret void
}

; Make sure we don't fold loads when we need to emit pcmpistrm and pcmpistri.
;
; The right-hand operand is loaded once (align 1) and used by the index, mask
; and flag intrinsics, all with control byte 24. The expected code performs a
; single movdqu into %xmm2 and passes that register to both pcmpistrm and
; pcmpistri, rather than giving either instruction a memory operand. As in
; the register-only variant, the left-hand operand is copied (to %xmm1)
; before pcmpistrm produces the mask in %xmm0, and the flag is read with setb
; after pcmpistri.
define void @pcmpistr_mask_index_flag_load(<16 x i8> %lhs, ptr %rhsptr, ptr %mptr, ptr %iptr, ptr %fptr) nounwind {
; X86-LABEL: pcmpistr_mask_index_flag_load:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %esi
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movdqu (%ecx), %xmm2
; X86-NEXT:    pcmpistrm $24, %xmm2, %xmm0
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpistri $24, %xmm2, %xmm1
; X86-NEXT:    setb %bl
; X86-NEXT:    movdqa %xmm0, (%esi)
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:    movl %ebx, (%eax)
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpistr_mask_index_flag_load:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    movdqa %xmm0, %xmm1
; X64-NEXT:    movdqu (%rdi), %xmm2
; X64-NEXT:    pcmpistrm $24, %xmm2, %xmm0
; X64-NEXT:    xorl %edi, %edi
; X64-NEXT:    pcmpistri $24, %xmm2, %xmm1
; X64-NEXT:    setb %dil
; X64-NEXT:    movdqa %xmm0, (%rsi)
; X64-NEXT:    movl %ecx, (%rdx)
; X64-NEXT:    movl %edi, (%rax)
; X64-NEXT:    retq
entry:
  %rhs = load <16 x i8>, ptr %rhsptr, align 1
  %index = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %mask = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  %flag = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %lhs, <16 x i8> %rhs, i8 24)
  store <16 x i8> %mask, ptr %mptr
  store i32 %index, ptr %iptr
  store i32 %flag, ptr %fptr
  ret void
}

; Make sure we don't fold nontemporal loads.
;
; The right-hand operand comes from a 16-byte-aligned load tagged with
; !nontemporal. The expected code keeps that load as a separate movntdqa into
; %xmm1 and gives pcmpestri a register operand. Only the carry-flag intrinsic
; is called (control byte 24), so the returned value is the setb result taken
; after pcmpestri.
define i32 @pcmpestri_nontemporal(<16 x i8> %lhs, i32 %lhs_len, ptr %rhsptr, i32 %rhs_len) nounwind {
; X86-LABEL: pcmpestri_nontemporal:
; X86:       # %bb.0: # %entry
; X86-NEXT:    pushl %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movntdqa (%ecx), %xmm1
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X86-NEXT:    setb %bl
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: pcmpestri_nontemporal:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    movntdqa (%rsi), %xmm1
; X64-NEXT:    xorl %esi, %esi
; X64-NEXT:    pcmpestri $24, %xmm1, %xmm0
; X64-NEXT:    setb %sil
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
entry:
  %rhs = load <16 x i8>, ptr %rhsptr, align 16, !nontemporal !0
  %flag = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %lhs, i32 %lhs_len, <16 x i8> %rhs, i32 %rhs_len, i8 24)
  ret i32 %flag
}

; Metadata node referenced by the !nontemporal attachment on the load in
; @pcmpestri_nontemporal.
!0 = !{ i32 1 }