; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown   -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64

; If the target does not have a single div/rem operation,
; the -div-rem-pairs pass will decompose the remainder calculation as:
;   X % Y --> X - ((X / Y) * Y)
; But if the target does have a single div/rem operation,
; the opposite transform is likely beneficial.
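; For example, given a combined div/rem, the decomposed pair
; (an illustrative sketch, not a pattern checked by this test):
;   %div = sdiv i32 %x, %y
;   %t1  = mul i32 %div, %y
;   %rem = sub i32 %x, %t1
; could be recomposed into:
;   %div = sdiv i32 %x, %y
;   %rem = srem i32 %x, %y
; so that both results come out of a single division.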

define i8 @scalar_i8(i8 %x, i8 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i8:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
; X86-NEXT:    movb {{[0-9]+}}(%esp), %cl
; X86-NEXT:    movsbl %cl, %eax
; X86-NEXT:    idivb %ch
; X86-NEXT:    movb %al, (%edx)
; X86-NEXT:    mulb %ch
; X86-NEXT:    subb %al, %cl
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i8:
; X64:       # %bb.0:
; X64-NEXT:    movsbl %dil, %ecx
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    idivb %sil
; X64-NEXT:    movb %al, (%rdx)
; X64-NEXT:    mulb %sil
; X64-NEXT:    subb %al, %cl
; X64-NEXT:    movl %ecx, %eax
; X64-NEXT:    retq
  %div = sdiv i8 %x, %y
  store i8 %div, ptr %divdst, align 4
  %t1 = mul i8 %div, %y
  %t2 = sub i8 %x, %t1
  ret i8 %t2
}

define i16 @scalar_i16(i16 %x, i16 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movw %ax, (%edi)
; X86-NEXT:    imull %eax, %esi
; X86-NEXT:    subl %esi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i16:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %si
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movw %ax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = sdiv i16 %x, %y
  store i16 %div, ptr %divdst, align 4
  %t1 = mul i16 %div, %y
  %t2 = sub i16 %x, %t1
  ret i16 %t2
}

define i32 @scalar_i32(i32 %x, i32 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = sdiv i32 %x, %y
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    calll __divdi3
; X86-NEXT:    addl $16, %esp
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %ecx, 4(%edx)
; X86-NEXT:    movl %eax, (%edx)
; X86-NEXT:    imull %eax, %ebp
; X86-NEXT:    mull %ebx
; X86-NEXT:    addl %ebp, %edx
; X86-NEXT:    imull %ebx, %ecx
; X86-NEXT:    addl %edx, %ecx
; X86-NEXT:    subl %eax, %esi
; X86-NEXT:    sbbl %ecx, %edi
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    cqto
; X64-NEXT:    idivq %rsi
; X64-NEXT:    movq %rax, (%rcx)
; X64-NEXT:    imulq %rsi, %rax
; X64-NEXT:    subq %rax, %rdi
; X64-NEXT:    movq %rdi, %rax
; X64-NEXT:    retq
  %div = sdiv i64 %x, %y
  store i64 %div, ptr %divdst, align 4
  %t1 = mul i64 %div, %y
  %t2 = sub i64 %x, %t1
  ret i64 %t2
}

; X86 doesn't have __divti3, so the sdiv is expanded into a loop.
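; The expansion takes absolute values, runs an unsigned shift-and-subtract
; long division (the udiv-do-while loop below, iterating once per remaining
; quotient bit), and then restores the quotient's sign. Roughly, in
; pseudo-C (an illustrative sketch only, not the checked output):
;   while (count--) {
;     (rem:quot) <<= 1;                            // bring in next dividend bit
;     if (rem >= divisor) { rem -= divisor; quot |= 1; }
;   }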
define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-LABEL: scalar_i128:
; X86:       # %bb.0: # %_udiv-special-cases
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $176, %esp
; X86-NEXT:    movl 20(%ebp), %edx
; X86-NEXT:    movl 24(%ebp), %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    sarl $31, %eax
; X86-NEXT:    xorl %eax, %ecx
; X86-NEXT:    movl %ecx, %edi
; X86-NEXT:    xorl %eax, %edx
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    movl 16(%ebp), %edx
; X86-NEXT:    xorl %eax, %edx
; X86-NEXT:    movl 12(%ebp), %ecx
; X86-NEXT:    xorl %eax, %ecx
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %eax, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %eax, %esi
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %eax, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 40(%ebp), %ecx
; X86-NEXT:    movl %ecx, %edx
; X86-NEXT:    sarl $31, %edx
; X86-NEXT:    movl %ecx, %esi
; X86-NEXT:    xorl %edx, %esi
; X86-NEXT:    movl 36(%ebp), %ecx
; X86-NEXT:    xorl %edx, %ecx
; X86-NEXT:    movl 32(%ebp), %ebx
; X86-NEXT:    xorl %edx, %ebx
; X86-NEXT:    movl 28(%ebp), %edi
; X86-NEXT:    xorl %edx, %edi
; X86-NEXT:    subl %edx, %edi
; X86-NEXT:    sbbl %edx, %ebx
; X86-NEXT:    sbbl %edx, %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %edx, %esi
; X86-NEXT:    xorl %eax, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ebx, %eax
; X86-NEXT:    orl %esi, %eax
; X86-NEXT:    movl %edi, %ecx
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    orl %eax, %ecx
; X86-NEXT:    sete %cl
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT:    orl %eax, %edx
; X86-NEXT:    sete %al
; X86-NEXT:    orb %cl, %al
; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT:    bsrl %esi, %edx
; X86-NEXT:    xorl $31, %edx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    bsrl %eax, %ecx
; X86-NEXT:    xorl $31, %ecx
; X86-NEXT:    orl $32, %ecx
; X86-NEXT:    testl %esi, %esi
; X86-NEXT:    cmovnel %edx, %ecx
; X86-NEXT:    bsrl %ebx, %edx
; X86-NEXT:    xorl $31, %edx
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    bsrl %edi, %edi
; X86-NEXT:    xorl $31, %edi
; X86-NEXT:    orl $32, %edi
; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    testl %ebx, %ebx
; X86-NEXT:    cmovnel %edx, %edi
; X86-NEXT:    orl $64, %edi
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    orl %esi, %edx
; X86-NEXT:    cmovnel %ecx, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    bsrl %eax, %edx
; X86-NEXT:    xorl $31, %edx
; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    xorl $31, %ecx
; X86-NEXT:    orl $32, %ecx
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    cmovnel %edx, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    bsrl %ebx, %esi
; X86-NEXT:    xorl $31, %esi
; X86-NEXT:    bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT:    xorl $31, %edx
; X86-NEXT:    orl $32, %edx
; X86-NEXT:    testl %ebx, %ebx
; X86-NEXT:    cmovnel %esi, %edx
; X86-NEXT:    orl $64, %edx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT:    orl %eax, %esi
; X86-NEXT:    cmovnel %ecx, %edx
; X86-NEXT:    xorl %ebx, %ebx
; X86-NEXT:    subl %edx, %edi
; X86-NEXT:    movl $0, %edx
; X86-NEXT:    sbbl %edx, %edx
; X86-NEXT:    movl $0, %esi
; X86-NEXT:    sbbl %esi, %esi
; X86-NEXT:    movl $0, %eax
; X86-NEXT:    sbbl %eax, %eax
; X86-NEXT:    movl $127, %ecx
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    cmpl %edi, %ecx
; X86-NEXT:    movl $0, %ecx
; X86-NEXT:    sbbl %edx, %ecx
; X86-NEXT:    movl $0, %ecx
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %esi, %ecx
; X86-NEXT:    movl $0, %ecx
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %eax, %ecx
; X86-NEXT:    setb %cl
; X86-NEXT:    orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT:    cmovnel %ebx, %esi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    cmovnel %ebx, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    cmovnel %ebx, %eax
; X86-NEXT:    cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT:    jne .LBB4_1
; X86-NEXT:  # %bb.8: # %_udiv-special-cases
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    movl %eax, %edx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    xorl $127, %eax
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    movl %edi, %ecx
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    orl %eax, %ecx
; X86-NEXT:    movl %edx, %eax
; X86-NEXT:    movl %ebx, %edx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    je .LBB4_9
; X86-NEXT:  # %bb.5: # %udiv-bb1
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    movl %ebx, %ecx
; X86-NEXT:    xorb $127, %cl
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    shrb $3, %al
; X86-NEXT:    andb $12, %al
; X86-NEXT:    negb %al
; X86-NEXT:    movsbl %al, %eax
; X86-NEXT:    movl 152(%esp,%eax), %esi
; X86-NEXT:    movl 156(%esp,%eax), %edx
; X86-NEXT:    shldl %cl, %esi, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 144(%esp,%eax), %edx
; X86-NEXT:    movl 148(%esp,%eax), %eax
; X86-NEXT:    shldl %cl, %eax, %esi
; X86-NEXT:    shldl %cl, %edx, %eax
; X86-NEXT:    shll %cl, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl $1, %ebx
; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    adcl $0, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    adcl $0, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    adcl $0, %edx
; X86-NEXT:    jae .LBB4_2
; X86-NEXT:  # %bb.6:
; X86-NEXT:    xorl %ecx, %ecx
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    movl %esi, %ebx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT:    jmp .LBB4_7
; X86-NEXT:  .LBB4_1:
; X86-NEXT:    movl %ebx, %edx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    jmp .LBB4_9
; X86-NEXT:  .LBB4_2: # %udiv-preheader
; X86-NEXT:    movl %edi, %ebx
; X86-NEXT:    movl %edx, %edi
; X86-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    shrb $3, %al
; X86-NEXT:    andb $12, %al
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movl 108(%esp,%eax), %edx
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 104(%esp,%eax), %ebx
; X86-NEXT:    movl %ebx, %esi
; X86-NEXT:    shrdl %cl, %edx, %esi
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 96(%esp,%eax), %esi
; X86-NEXT:    movl 100(%esp,%eax), %eax
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    shrdl %cl, %ebx, %edi
; X86-NEXT:    movl %edi, %ebx
; X86-NEXT:    shrl %cl, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
; X86-NEXT:    shrdl %cl, %eax, %esi
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    addl $-1, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    adcl $-1, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    adcl $-1, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    adcl $-1, %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    xorl %edx, %edx
; X86-NEXT:    movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    .p2align 4
; X86-NEXT:  .LBB4_3: # %udiv-do-while
; X86-NEXT:    # =>This Inner Loop Header: Depth=1
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT:    movl %ebx, %edx
; X86-NEXT:    shldl $1, %ebx, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    shldl $1, %ebx, %edx
; X86-NEXT:    shldl $1, %ecx, %ebx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT:    shldl $1, %edi, %ecx
; X86-NEXT:    orl %esi, %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    shldl $1, %ecx, %edi
; X86-NEXT:    orl %esi, %edi
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT:    shldl $1, %edi, %ecx
; X86-NEXT:    orl %esi, %ecx
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    addl %edi, %edi
; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    sbbl %edx, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    sbbl %eax, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    sarl $31, %ecx
; X86-NEXT:    movl %ecx, %esi
; X86-NEXT:    andl $1, %esi
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %ecx, %edi
; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT:    movl %ecx, %esi
; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    subl %ecx, %ebx
; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    sbbl %eax, %edx
; X86-NEXT:    movl %edx, %ebx
; X86-NEXT:    sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    sbbl %edi, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    addl $-1, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    adcl $-1, %eax
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT:    adcl $-1, %edi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT:    adcl $-1, %esi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    orl %esi, %eax
; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    orl %edi, %ecx
; X86-NEXT:    orl %eax, %ecx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    jne .LBB4_3
; X86-NEXT:  # %bb.4:
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    movl %ecx, %esi
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:  .LBB4_7: # %udiv-loop-exit
; X86-NEXT:    shldl $1, %ebx, %esi
; X86-NEXT:    orl %edx, %esi
; X86-NEXT:    shldl $1, %eax, %ebx
; X86-NEXT:    orl %edx, %ebx
; X86-NEXT:    shldl $1, %edi, %eax
; X86-NEXT:    orl %edx, %eax
; X86-NEXT:    movl %edi, %edx
; X86-NEXT:    addl %edi, %edx
; X86-NEXT:    orl %ecx, %edx
; X86-NEXT:  .LBB4_9: # %udiv-end
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT:    xorl %ecx, %esi
; X86-NEXT:    xorl %ecx, %ebx
; X86-NEXT:    xorl %ecx, %eax
; X86-NEXT:    xorl %ecx, %edx
; X86-NEXT:    subl %ecx, %edx
; X86-NEXT:    sbbl %ecx, %eax
; X86-NEXT:    sbbl %ecx, %ebx
; X86-NEXT:    sbbl %ecx, %esi
; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 44(%ebp), %ecx
; X86-NEXT:    movl %edx, (%ecx)
; X86-NEXT:    movl %eax, 4(%ecx)
; X86-NEXT:    movl %ebx, 8(%ecx)
; X86-NEXT:    movl %esi, 12(%ecx)
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl 28(%ebp), %ecx
; X86-NEXT:    movl %ebx, %edi
; X86-NEXT:    movl %edx, %esi
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %edx, %ebx
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    mull %ecx
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    adcl $0, %ebx
; X86-NEXT:    movl %esi, %eax
; X86-NEXT:    movl 32(%ebp), %esi
; X86-NEXT:    mull %esi
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    adcl %ebx, %edx
; X86-NEXT:    movl %edx, %ecx
; X86-NEXT:    setb %bl
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT:    mull %esi
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movzbl %bl, %eax
; X86-NEXT:    adcl %eax, %edx
; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT:    movl 28(%ebp), %eax
; X86-NEXT:    imull %eax, %ebx
; X86-NEXT:    mull %edi
; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT:    imull %esi, %edi
; X86-NEXT:    addl %edx, %edi
; X86-NEXT:    addl %ebx, %edi
; X86-NEXT:    movl 36(%ebp), %eax
; X86-NEXT:    movl %eax, %esi
; X86-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT:    movl 40(%ebp), %ebx
; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT:    imull %edx, %ebx
; X86-NEXT:    mull %edx
; X86-NEXT:    addl %edx, %ebx
; X86-NEXT:    addl %esi, %ebx
; X86-NEXT:    addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT:    adcl %edi, %ebx
; X86-NEXT:    addl %ecx, %eax
; X86-NEXT:    adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT:    movl 12(%ebp), %edx
; X86-NEXT:    subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT:    movl 16(%ebp), %ecx
; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT:    movl 20(%ebp), %edi
; X86-NEXT:    sbbl %eax, %edi
; X86-NEXT:    movl 24(%ebp), %esi
; X86-NEXT:    sbbl %ebx, %esi
; X86-NEXT:    movl 8(%ebp), %eax
; X86-NEXT:    movl %edx, (%eax)
; X86-NEXT:    movl %ecx, 4(%eax)
; X86-NEXT:    movl %edi, 8(%eax)
; X86-NEXT:    movl %esi, 12(%eax)
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl $4
;
; X64-LABEL: scalar_i128:
; X64:       # %bb.0:
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %r8, %r15
; X64-NEXT:    movq %rcx, %r12
; X64-NEXT:    movq %rdx, %r13
; X64-NEXT:    movq %rsi, %rbx
; X64-NEXT:    movq %rdi, %r14
; X64-NEXT:    callq __divti3@PLT
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movq %rdx, 8(%r15)
; X64-NEXT:    movq %rax, (%r15)
; X64-NEXT:    imulq %rax, %r12
; X64-NEXT:    mulq %r13
; X64-NEXT:    addq %r12, %rdx
; X64-NEXT:    imulq %r13, %rcx
; X64-NEXT:    addq %rdx, %rcx
; X64-NEXT:    subq %rax, %r14
; X64-NEXT:    sbbq %rcx, %rbx
; X64-NEXT:    movq %r14, %rax
; X64-NEXT:    movq %rbx, %rdx
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    retq
  %div = sdiv i128 %x, %y
  store i128 %div, ptr %divdst, align 4
  %t1 = mul i128 %div, %y
  %t2 = sub i128 %x, %t1
  ret i128 %t2
}

define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i8:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    andl $-16, %esp
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    movdqa %xmm0, (%esp)
; X86-NEXT:    movdqa %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm6
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edx
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %esi
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %edi
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movzbl %al, %ebx
; X86-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, %ecx
; X86-NEXT:    movsbl (%esp), %eax
; X86-NEXT:    idivb {{[0-9]+}}(%esp)
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT:    movd %edx, %xmm7
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X86-NEXT:    movd %esi, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; X86-NEXT:    movd %edi, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; X86-NEXT:    movd %ebx, %xmm5
; X86-NEXT:    movzbl %cl, %ecx
; X86-NEXT:    movd %ecx, %xmm6
; X86-NEXT:    movl 8(%ebp), %ecx
; X86-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; X86-NEXT:    movzbl %al, %eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X86-NEXT:    movdqa %xmm2, %xmm4
; X86-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; X86-NEXT:    movdqa %xmm4, (%ecx)
; X86-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    movdqa %xmm1, %xmm4
; X86-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X86-NEXT:    pmullw %xmm3, %xmm4
; X86-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
; X86-NEXT:    pand %xmm3, %xmm4
; X86-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-NEXT:    pmullw %xmm2, %xmm1
; X86-NEXT:    pand %xmm3, %xmm1
; X86-NEXT:    packuswb %xmm4, %xmm1
; X86-NEXT:    psubb %xmm1, %xmm0
; X86-NEXT:    leal -12(%ebp), %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i8:
; X64:       # %bb.0:
; X64-NEXT:    pushq %rbp
; X64-NEXT:    pushq %r15
; X64-NEXT:    pushq %r14
; X64-NEXT:    pushq %r13
; X64-NEXT:    pushq %r12
; X64-NEXT:    pushq %rbx
; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edi
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %esi
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r8d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r9d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r10d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r11d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebx
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %ebp
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r14d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r15d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r12d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %r13d
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movzbl %al, %edx
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT:    idivb -{{[0-9]+}}(%rsp)
; X64-NEXT:    movd %edi, %xmm3
; X64-NEXT:    movd %esi, %xmm4
; X64-NEXT:    movd %r8d, %xmm5
; X64-NEXT:    movd %r9d, %xmm6
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
; X64-NEXT:    movd %r10d, %xmm7
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
; X64-NEXT:    movd %r11d, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    movd %ebx, %xmm2
; X64-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; X64-NEXT:    movd %ebp, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; X64-NEXT:    movd %r14d, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; X64-NEXT:    movd %r15d, %xmm6
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; X64-NEXT:    movd %r12d, %xmm5
; X64-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; X64-NEXT:    movd %r13d, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
; X64-NEXT:    movd %edx, %xmm6
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT:    movzbl %cl, %ecx
; X64-NEXT:    movd %ecx, %xmm4
; X64-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; X64-NEXT:    movzbl %al, %eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
; X64-NEXT:    movdqa %xmm3, %xmm4
; X64-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; X64-NEXT:    movdqa %xmm4, (%rax)
; X64-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    movdqa %xmm1, %xmm4
; X64-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; X64-NEXT:    pmullw %xmm2, %xmm4
; X64-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; X64-NEXT:    pand %xmm2, %xmm4
; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X64-NEXT:    pmullw %xmm3, %xmm1
; X64-NEXT:    pand %xmm2, %xmm1
; X64-NEXT:    packuswb %xmm4, %xmm1
; X64-NEXT:    psubb %xmm1, %xmm0
; X64-NEXT:    popq %rbx
; X64-NEXT:    popq %r12
; X64-NEXT:    popq %r13
; X64-NEXT:    popq %r14
; X64-NEXT:    popq %r15
; X64-NEXT:    popq %rbp
; X64-NEXT:    retq
  %div = sdiv <16 x i8> %x, %y
  store <16 x i8> %div, ptr %divdst, align 16
  %t1 = mul <16 x i8> %div, %y
  %t2 = sub <16 x i8> %x, %t1
  ret <16 x i8> %t2
}

define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pextrw $7, %xmm0, %eax
; X86-NEXT:    pextrw $7, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    pextrw $6, %xmm0, %eax
; X86-NEXT:    pextrw $6, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT:    pextrw $5, %xmm0, %eax
; X86-NEXT:    pextrw $5, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    pextrw $4, %xmm0, %eax
; X86-NEXT:    pextrw $4, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    pextrw $3, %xmm0, %eax
; X86-NEXT:    pextrw $3, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    pextrw $2, %xmm0, %eax
; X86-NEXT:    pextrw $2, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT:    pextrw $1, %xmm0, %eax
; X86-NEXT:    pextrw $1, %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    # kill: def $ax killed $ax killed $eax
; X86-NEXT:    cwtd
; X86-NEXT:    idivw %si
; X86-NEXT:    # kill: def $ax killed $ax def $eax
; X86-NEXT:    movd %eax, %xmm5
; X86-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X86-NEXT:    movdqa %xmm5, (%ecx)
; X86-NEXT:    pmullw %xmm1, %xmm5
; X86-NEXT:    psubw %xmm5, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i16:
; X64:       # %bb.0:
; X64-NEXT:    pextrw $7, %xmm0, %eax
; X64-NEXT:    pextrw $7, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pextrw $6, %xmm0, %eax
; X64-NEXT:    pextrw $6, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X64-NEXT:    pextrw $5, %xmm0, %eax
; X64-NEXT:    pextrw $5, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    pextrw $4, %xmm0, %eax
; X64-NEXT:    pextrw $4, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT:    pextrw $3, %xmm0, %eax
; X64-NEXT:    pextrw $3, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    pextrw $2, %xmm0, %eax
; X64-NEXT:    pextrw $2, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT:    pextrw $1, %xmm0, %eax
; X64-NEXT:    pextrw $1, %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    # kill: def $ax killed $ax killed $eax
; X64-NEXT:    cwtd
; X64-NEXT:    idivw %cx
; X64-NEXT:    # kill: def $ax killed $ax def $eax
; X64-NEXT:    movd %eax, %xmm5
; X64-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; X64-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
; X64-NEXT:    movdqa %xmm5, (%rdi)
; X64-NEXT:    pmullw %xmm1, %xmm5
; X64-NEXT:    psubw %xmm5, %xmm0
; X64-NEXT:    retq
  %div = sdiv <8 x i16> %x, %y
  store <8 x i16> %div, ptr %divdst, align 16
  %t1 = mul <8 x i16> %div, %y
  %t2 = sub <8 x i16> %x, %t1
  ret <8 x i16> %t2
}

define <4 x i32> @vector_i128_i32(<4 x i32> %x, <4 x i32> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm2, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, %esi
; X86-NEXT:    cltd
; X86-NEXT:    idivl %esi
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm2, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm2, %esi
; X86-NEXT:    cltd
; X86-NEXT:    idivl %esi
; X86-NEXT:    movd %eax, %xmm2
; X86-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT:    movd %xmm0, %eax
; X86-NEXT:    movd %xmm1, %esi
; X86-NEXT:    cltd
; X86-NEXT:    idivl %esi
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X86-NEXT:    movd %xmm4, %eax
; X86-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm4, %esi
; X86-NEXT:    cltd
; X86-NEXT:    idivl %esi
; X86-NEXT:    movd %eax, %xmm4
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; X86-NEXT:    movdqa %xmm3, (%ecx)
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3]
; X86-NEXT:    pmuludq %xmm1, %xmm3
; X86-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-NEXT:    pmuludq %xmm2, %xmm1
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-NEXT:    psubd %xmm3, %xmm0
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
; X64-NEXT:    movd %xmm2, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X64-NEXT:    movd %xmm2, %ecx
; X64-NEXT:    cltd
; X64-NEXT:    idivl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movd %xmm3, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movd %xmm3, %ecx
; X64-NEXT:    cltd
; X64-NEXT:    idivl %ecx
; X64-NEXT:    movd %eax, %xmm3
; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; X64-NEXT:    movd %xmm0, %eax
; X64-NEXT:    movd %xmm1, %ecx
; X64-NEXT:    cltd
; X64-NEXT:    idivl %ecx
; X64-NEXT:    movd %eax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
; X64-NEXT:    movd %xmm4, %eax
; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
; X64-NEXT:    movd %xmm4, %ecx
; X64-NEXT:    cltd
; X64-NEXT:    idivl %ecx
; X64-NEXT:    movd %eax, %xmm4
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X64-NEXT:    pmuludq %xmm3, %xmm1
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; X64-NEXT:    psubd %xmm2, %xmm0
; X64-NEXT:    retq
  %div = sdiv <4 x i32> %x, %y
  store <4 x i32> %div, ptr %divdst, align 16
  %t1 = mul <4 x i32> %div, %y
  %t2 = sub <4 x i32> %x, %t1
  ret <4 x i32> %t2
}

define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i64:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $64, %esp
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; X86-NEXT:    movd %xmm2, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    calll __divdi3
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
; X86-NEXT:    movd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    movd %xmm1, (%esp)
; X86-NEXT:    movd %edx, %xmm0
; X86-NEXT:    movd %eax, %xmm1
; X86-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT:    movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __divdi3
; X86-NEXT:    movd %edx, %xmm1
; X86-NEXT:    movd %eax, %xmm3
; X86-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
; X86-NEXT:    movdqa %xmm3, (%esi)
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    movdqa %xmm0, %xmm1
; X86-NEXT:    psrlq $32, %xmm1
; X86-NEXT:    pmuludq %xmm3, %xmm1
; X86-NEXT:    movdqa %xmm3, %xmm2
; X86-NEXT:    psrlq $32, %xmm2
; X86-NEXT:    pmuludq %xmm0, %xmm2
; X86-NEXT:    paddq %xmm1, %xmm2
; X86-NEXT:    psllq $32, %xmm2
; X86-NEXT:    pmuludq %xmm0, %xmm3
; X86-NEXT:    paddq %xmm2, %xmm3
; X86-NEXT:    movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    psubq %xmm3, %xmm0
; X86-NEXT:    addl $64, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: vector_i128_i64:
; X64:       # %bb.0:
; X64-NEXT:    movq %xmm0, %rax
; X64-NEXT:    movq %xmm1, %rcx
; X64-NEXT:    cqto
; X64-NEXT:    idivq %rcx
; X64-NEXT:    movq %rax, %xmm2
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rax
; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; X64-NEXT:    movq %xmm3, %rcx
; X64-NEXT:    cqto
; X64-NEXT:    idivq %rcx
; X64-NEXT:    movq %rax, %xmm3
; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X64-NEXT:    movdqa %xmm2, (%rdi)
; X64-NEXT:    movdqa %xmm1, %xmm3
; X64-NEXT:    psrlq $32, %xmm3
; X64-NEXT:    pmuludq %xmm2, %xmm3
; X64-NEXT:    movdqa %xmm2, %xmm4
; X64-NEXT:    psrlq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm4
; X64-NEXT:    paddq %xmm3, %xmm4
; X64-NEXT:    psllq $32, %xmm4
; X64-NEXT:    pmuludq %xmm1, %xmm2
; X64-NEXT:    paddq %xmm4, %xmm2
; X64-NEXT:    psubq %xmm2, %xmm0
; X64-NEXT:    retq
  %div = sdiv <2 x i64> %x, %y
  store <2 x i64> %div, ptr %divdst, align 16
  %t1 = mul <2 x i64> %div, %y
  %t2 = sub <2 x i64> %x, %t1
  ret <2 x i64> %t2
}

; Special tests.

define i32 @scalar_i32_commutative(i32 %x, ptr %ysrc, ptr %divdst) nounwind {
; X86-LABEL: scalar_i32_commutative:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl (%eax), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %eax, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: scalar_i32_commutative:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %rcx
; X64-NEXT:    movl (%rsi), %esi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %eax, %esi
; X64-NEXT:    subl %esi, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %y = load i32, ptr %ysrc, align 4
  %div = sdiv i32 %x, %y
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %y, %div ; commutative
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; We do not care about extra uses.
define i32 @extrause(i32 %x, i32 %y, ptr %divdst, ptr %t1dst) nounwind {
; X86-LABEL: extrause:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ebx
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    imull %ebx, %eax
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: extrause:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r8
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %esi
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    subl %eax, %edi
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    retq
  %div = sdiv i32 %x, %y
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %div, %y
  store i32 %t1, ptr %t1dst, align 4
  %t2 = sub i32 %x, %t1
  ret i32 %t2
}

; 'rem' should appear next to 'div'.
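; That is, for the recomposition to pay off here, a recomposed srem would
; have to sit back-to-back with the sdiv in %bb.0 so both results could
; share one divide, even though the mul/sub lives in %do_srem (a note on
; intent, not a property checked by these assertions).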
define i32 @multiple_bb(i32 %x, i32 %y, ptr %divdst, i1 zeroext %store_srem, ptr %sremdst) nounwind {
; X86-LABEL: multiple_bb:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl %esi
; X86-NEXT:    movl %eax, (%edi)
; X86-NEXT:    testb %bl, %bl
; X86-NEXT:    je .LBB11_2
; X86-NEXT:  # %bb.1: # %do_srem
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
; X86-NEXT:    movl %eax, %edi
; X86-NEXT:    imull %esi, %edi
; X86-NEXT:    subl %edi, %ecx
; X86-NEXT:    movl %ecx, (%edx)
; X86-NEXT:  .LBB11_2: # %end
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    retl
;
; X64-LABEL: multiple_bb:
; X64:       # %bb.0:
; X64-NEXT:    movq %rdx, %r9
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %esi
; X64-NEXT:    movl %eax, (%r9)
; X64-NEXT:    testl %ecx, %ecx
; X64-NEXT:    je .LBB11_2
; X64-NEXT:  # %bb.1: # %do_srem
; X64-NEXT:    movl %eax, %ecx
; X64-NEXT:    imull %esi, %ecx
; X64-NEXT:    subl %ecx, %edi
; X64-NEXT:    movl %edi, (%r8)
; X64-NEXT:  .LBB11_2: # %end
; X64-NEXT:    retq
  %div = sdiv i32 %x, %y
  store i32 %div, ptr %divdst, align 4
  br i1 %store_srem, label %do_srem, label %end
do_srem:
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x, %t1
  store i32 %t2, ptr %sremdst, align 4
  br label %end
end:
  ret i32 %div
}

define i32 @negative_different_x(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
; X86-LABEL: negative_different_x:
; X86:       # %bb.0:
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
; X86-NEXT:    cltd
; X86-NEXT:    idivl %edi
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %edi, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_x:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %r8d
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %r8d
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %r8d, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = sdiv i32 %x0, %y ; not %x1
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1 ; not %x0
  ret i32 %t2
}

define i32 @negative_different_y(i32 %x0, i32 %x1, i32 %y, i32 %z, ptr %divdst) nounwind {
; X86-LABEL: negative_different_y:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    cltd
; X86-NEXT:    idivl {{[0-9]+}}(%esp)
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull {{[0-9]+}}(%esp), %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_different_y:
; X64:       # %bb.0:
; X64-NEXT:    movl %edx, %edi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %ecx
; X64-NEXT:    movl %eax, (%r8)
; X64-NEXT:    imull %eax, %edi
; X64-NEXT:    subl %edi, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = sdiv i32 %x1, %z ; not %y
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %div, %y
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}

define i32 @negative_inverted_division(i32 %x0, i32 %x1, i32 %y, ptr %divdst) nounwind {
; X86-LABEL: negative_inverted_division:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT:    cltd
; X86-NEXT:    idivl %ecx
; X86-NEXT:    movl %eax, (%esi)
; X86-NEXT:    imull %ecx, %eax
; X86-NEXT:    subl %eax, %ecx
; X86-NEXT:    movl %ecx, %eax
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; X64-LABEL: negative_inverted_division:
; X64:       # %bb.0:
; X64-NEXT:    movl %edi, %eax
; X64-NEXT:    cltd
; X64-NEXT:    idivl %esi
; X64-NEXT:    movl %eax, (%rcx)
; X64-NEXT:    imull %esi, %eax
; X64-NEXT:    subl %eax, %esi
; X64-NEXT:    movl %esi, %eax
; X64-NEXT:    retq
  %div = sdiv i32 %x0, %x1 ; inverted division
  store i32 %div, ptr %divdst, align 4
  %t1 = mul i32 %div, %x1
  %t2 = sub i32 %x1, %t1
  ret i32 %t2
}