; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2

; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=sifive-p670 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64P670
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=spacemit-x60 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64X60
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64
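
; In all four configurations the interesting output is where the write to
; the vxrm CSR required by vaaddu.vv lands: outside the vectorized inner
; loop (%vector.ph) for the generic RV32/RV64 pipelines, and hoisted all
; the way to function entry when scheduling for sifive-p670 and
; spacemit-x60.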


; test1
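; A rounding byte-average of two strided 2D buffers. A rough C sketch of
; the loop nest below (parameter names follow the IR; the locals and exact
; source form are assumptions, not the original source):
;
;   #include <stdint.h>
;
;   void test1(uint8_t *dst, int i_dst_stride,
;              const uint8_t *src1, int i_src1_stride,
;              const uint8_t *src2, int i_src2_stride,
;              int i_width, int i_height) {
;     for (int y = 0; y < i_height; ++y) {
;       for (int x = 0; x < i_width; ++x)
;         dst[x] = (uint8_t)((src1[x] + src2[x] + 1) >> 1);
;       dst += i_dst_stride;
;       src1 += i_src1_stride;
;       src2 += i_src2_stride;
;     }
;   }
;
; The inner loop vectorizes to vaaddu.vv under round-to-nearest-up
; (vxrm = 0), so a single csrwi that dominates the loop suffices.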
define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_dst_stride, ptr nocapture noundef readonly %src1, i32 noundef signext %i_src1_stride, ptr nocapture noundef readonly %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) {
; RV32-LABEL: test1:
; RV32:       # %bb.0: # %entry
; RV32-NEXT:    blez a7, .LBB0_17
; RV32-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV32-NEXT:    blez a6, .LBB0_17
; RV32-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi t0, a7, -1
; RV32-NEXT:    csrr t2, vlenb
; RV32-NEXT:    mul t3, a1, t0
; RV32-NEXT:    mul t4, a3, t0
; RV32-NEXT:    mul t5, a5, t0
; RV32-NEXT:    slli t1, t2, 1
; RV32-NEXT:    li t6, 32
; RV32-NEXT:    mv t0, t1
; RV32-NEXT:    bnez zero, .LBB0_4
; RV32-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 32
; RV32-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset s0, -4
; RV32-NEXT:    .cfi_offset s1, -8
; RV32-NEXT:    .cfi_offset s2, -12
; RV32-NEXT:    add t3, a0, t3
; RV32-NEXT:    add t4, a2, t4
; RV32-NEXT:    add s0, a4, t5
; RV32-NEXT:    bltu t6, t1, .LBB0_6
; RV32-NEXT:  # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t1, 32
; RV32-NEXT:  .LBB0_6: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    add t3, t3, a6
; RV32-NEXT:    add t5, t4, a6
; RV32-NEXT:    add t4, s0, a6
; RV32-NEXT:    beqz zero, .LBB0_8
; RV32-NEXT:  # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    mv t1, t0
; RV32-NEXT:  .LBB0_8: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 0
; RV32-NEXT:    sltu t5, a0, t5
; RV32-NEXT:    sltu t6, a2, t3
; RV32-NEXT:    and t5, t5, t6
; RV32-NEXT:    sltu t4, a0, t4
; RV32-NEXT:    sltu t3, a4, t3
; RV32-NEXT:    and t3, t4, t3
; RV32-NEXT:    or t4, a1, a3
; RV32-NEXT:    slti t4, t4, 0
; RV32-NEXT:    or t4, t5, t4
; RV32-NEXT:    or t5, a1, a5
; RV32-NEXT:    sltu t1, a6, t1
; RV32-NEXT:    slti t5, t5, 0
; RV32-NEXT:    or t3, t3, t5
; RV32-NEXT:    or t3, t4, t3
; RV32-NEXT:    or t1, t1, t3
; RV32-NEXT:    andi t1, t1, 1
; RV32-NEXT:    slli t2, t2, 1
; RV32-NEXT:    j .LBB0_10
; RV32-NEXT:  .LBB0_9: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    add a2, a2, a3
; RV32-NEXT:    addi t0, t0, 1
; RV32-NEXT:    add a4, a4, a5
; RV32-NEXT:    beq t0, a7, .LBB0_16
; RV32-NEXT:  .LBB0_10: # %for.cond1.preheader.us
; RV32-NEXT:    # =>This Loop Header: Depth=1
; RV32-NEXT:    # Child Loop BB0_13 Depth 2
; RV32-NEXT:    # Child Loop BB0_15 Depth 2
; RV32-NEXT:    beqz t1, .LBB0_12
; RV32-NEXT:  # %bb.11: # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t4, 0
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    j .LBB0_15
; RV32-NEXT:  .LBB0_12: # %vector.ph
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    neg t4, t2
; RV32-NEXT:    and t4, t4, a6
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    li t6, 0
; RV32-NEXT:    li t5, 0
; RV32-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV32-NEXT:  .LBB0_13: # %vector.body
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add s0, a2, t6
; RV32-NEXT:    add s1, a4, t6
; RV32-NEXT:    vl2r.v v8, (s0)
; RV32-NEXT:    add s0, a0, t6
; RV32-NEXT:    vl2r.v v10, (s1)
; RV32-NEXT:    add s1, t6, t2
; RV32-NEXT:    sltu t6, s1, t6
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, s1, t4
; RV32-NEXT:    vaaddu.vv v8, v8, v10
; RV32-NEXT:    or s2, t6, t5
; RV32-NEXT:    vs2r.v v8, (s0)
; RV32-NEXT:    mv t6, s1
; RV32-NEXT:    bnez s2, .LBB0_13
; RV32-NEXT:  # %bb.14: # %middle.block
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    beq t4, a6, .LBB0_9
; RV32-NEXT:  .LBB0_15: # %for.body4.us
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add t5, a2, t4
; RV32-NEXT:    add t6, a4, t4
; RV32-NEXT:    add s0, a0, t4
; RV32-NEXT:    lbu t5, 0(t5)
; RV32-NEXT:    lbu t6, 0(t6)
; RV32-NEXT:    addi t4, t4, 1
; RV32-NEXT:    seqz s1, t4
; RV32-NEXT:    add t3, t3, s1
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, t4, a6
; RV32-NEXT:    addi t5, t5, 1
; RV32-NEXT:    srli t5, t5, 1
; RV32-NEXT:    or t6, t6, t3
; RV32-NEXT:    sb t5, 0(s0)
; RV32-NEXT:    bnez t6, .LBB0_15
; RV32-NEXT:    j .LBB0_9
; RV32-NEXT:  .LBB0_16:
; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    .cfi_restore s1
; RV32-NEXT:    .cfi_restore s2
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:  .LBB0_17: # %for.cond.cleanup
; RV32-NEXT:    ret
;
; RV64P670-LABEL: test1:
; RV64P670:       # %bb.0: # %entry
; RV64P670-NEXT:    csrwi vxrm, 0
; RV64P670-NEXT:    blez a7, .LBB0_12
; RV64P670-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64P670-NEXT:    blez a6, .LBB0_12
; RV64P670-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64P670-NEXT:    addi sp, sp, -48
; RV64P670-NEXT:    .cfi_def_cfa_offset 48
; RV64P670-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    .cfi_offset s0, -8
; RV64P670-NEXT:    .cfi_offset s1, -16
; RV64P670-NEXT:    .cfi_offset s2, -24
; RV64P670-NEXT:    .cfi_offset s3, -32
; RV64P670-NEXT:    .cfi_offset s4, -40
; RV64P670-NEXT:    addi s1, a7, -1
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    li t0, 0
; RV64P670-NEXT:    li t1, 0
; RV64P670-NEXT:    zext.w s1, s1
; RV64P670-NEXT:    mul t2, a1, s1
; RV64P670-NEXT:    add t4, s0, t2
; RV64P670-NEXT:    mul t2, a3, s1
; RV64P670-NEXT:    add s0, a2, a6
; RV64P670-NEXT:    mul s1, a5, s1
; RV64P670-NEXT:    add t3, s0, t2
; RV64P670-NEXT:    add s0, a4, a6
; RV64P670-NEXT:    csrr t2, vlenb
; RV64P670-NEXT:    add t5, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t3
; RV64P670-NEXT:    sltu s0, a2, t4
; RV64P670-NEXT:    slli t3, t2, 1
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a3
; RV64P670-NEXT:    slti s1, s1, 0
; RV64P670-NEXT:    or t6, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t5
; RV64P670-NEXT:    sltu s0, a4, t4
; RV64P670-NEXT:    mv t5, a0
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a5
; RV64P670-NEXT:    slti s1, s1, 0
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    li s1, 32
; RV64P670-NEXT:    maxu s1, t3, s1
; RV64P670-NEXT:    or s0, t6, s0
; RV64P670-NEXT:    sltu s1, a6, s1
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    andi t4, s0, 1
; RV64P670-NEXT:    j .LBB0_4
; RV64P670-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    add t5, t5, a1
; RV64P670-NEXT:    add a2, a2, a3
; RV64P670-NEXT:    add a4, a4, a5
; RV64P670-NEXT:    addiw t1, t1, 1
; RV64P670-NEXT:    addi t0, t0, 1
; RV64P670-NEXT:    beq t1, a7, .LBB0_11
; RV64P670-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64P670-NEXT:    # =>This Loop Header: Depth=1
; RV64P670-NEXT:    # Child Loop BB0_7 Depth 2
; RV64P670-NEXT:    # Child Loop BB0_10 Depth 2
; RV64P670-NEXT:    beqz t4, .LBB0_6
; RV64P670-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    li t6, 0
; RV64P670-NEXT:    j .LBB0_9
; RV64P670-NEXT:  .LBB0_6: # %vector.ph
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    slli s1, t2, 28
; RV64P670-NEXT:    mv s2, a2
; RV64P670-NEXT:    mv s3, a4
; RV64P670-NEXT:    mv s4, t5
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64P670-NEXT:    and t6, s1, a6
; RV64P670-NEXT:    mv s1, t6
; RV64P670-NEXT:  .LBB0_7: # %vector.body
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    vl2r.v v8, (s2)
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    add s2, s2, t3
; RV64P670-NEXT:    vl2r.v v10, (s3)
; RV64P670-NEXT:    add s3, s3, t3
; RV64P670-NEXT:    vaaddu.vv v8, v8, v10
; RV64P670-NEXT:    vs2r.v v8, (s4)
; RV64P670-NEXT:    add s4, s4, t3
; RV64P670-NEXT:    bnez s1, .LBB0_7
; RV64P670-NEXT:  # %bb.8: # %middle.block
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    beq t6, a6, .LBB0_3
; RV64P670-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    mul s2, a1, t0
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    add s1, t5, t6
; RV64P670-NEXT:    add s4, a4, t6
; RV64P670-NEXT:    add t6, t6, a2
; RV64P670-NEXT:    add s2, s2, s0
; RV64P670-NEXT:  .LBB0_10: # %for.body4.us
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    lbu s3, 0(t6)
; RV64P670-NEXT:    lbu s0, 0(s4)
; RV64P670-NEXT:    addi s4, s4, 1
; RV64P670-NEXT:    addi t6, t6, 1
; RV64P670-NEXT:    add s0, s0, s3
; RV64P670-NEXT:    addi s0, s0, 1
; RV64P670-NEXT:    srli s0, s0, 1
; RV64P670-NEXT:    sb s0, 0(s1)
; RV64P670-NEXT:    addi s1, s1, 1
; RV64P670-NEXT:    bne s1, s2, .LBB0_10
; RV64P670-NEXT:    j .LBB0_3
; RV64P670-NEXT:  .LBB0_11:
; RV64P670-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    .cfi_restore s0
; RV64P670-NEXT:    .cfi_restore s1
; RV64P670-NEXT:    .cfi_restore s2
; RV64P670-NEXT:    .cfi_restore s3
; RV64P670-NEXT:    .cfi_restore s4
; RV64P670-NEXT:    addi sp, sp, 48
; RV64P670-NEXT:    .cfi_def_cfa_offset 0
; RV64P670-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64P670-NEXT:    ret
;
; RV64X60-LABEL: test1:
; RV64X60:       # %bb.0: # %entry
; RV64X60-NEXT:    csrwi vxrm, 0
; RV64X60-NEXT:    blez a7, .LBB0_12
; RV64X60-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64X60-NEXT:    blez a6, .LBB0_12
; RV64X60-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64X60-NEXT:    addi sp, sp, -48
; RV64X60-NEXT:    .cfi_def_cfa_offset 48
; RV64X60-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    .cfi_offset s0, -8
; RV64X60-NEXT:    .cfi_offset s1, -16
; RV64X60-NEXT:    .cfi_offset s2, -24
; RV64X60-NEXT:    .cfi_offset s3, -32
; RV64X60-NEXT:    .cfi_offset s4, -40
; RV64X60-NEXT:    li t0, 0
; RV64X60-NEXT:    li t1, 0
; RV64X60-NEXT:    addi t2, a7, -1
; RV64X60-NEXT:    add t4, a0, a6
; RV64X60-NEXT:    add t5, a2, a6
; RV64X60-NEXT:    add t3, a4, a6
; RV64X60-NEXT:    zext.w s0, t2
; RV64X60-NEXT:    mul s1, a1, s0
; RV64X60-NEXT:    add t4, t4, s1
; RV64X60-NEXT:    mul s1, a3, s0
; RV64X60-NEXT:    add t5, t5, s1
; RV64X60-NEXT:    csrr t2, vlenb
; RV64X60-NEXT:    mul s1, a5, s0
; RV64X60-NEXT:    add t3, t3, s1
; RV64X60-NEXT:    sltu s1, a0, t5
; RV64X60-NEXT:    sltu s0, a2, t4
; RV64X60-NEXT:    and t6, s1, s0
; RV64X60-NEXT:    li t5, 32
; RV64X60-NEXT:    sltu s1, a0, t3
; RV64X60-NEXT:    sltu s0, a4, t4
; RV64X60-NEXT:    and t3, s1, s0
; RV64X60-NEXT:    or s1, a1, a3
; RV64X60-NEXT:    slti s1, s1, 0
; RV64X60-NEXT:    or t4, t6, s1
; RV64X60-NEXT:    or s0, a1, a5
; RV64X60-NEXT:    slti s0, s0, 0
; RV64X60-NEXT:    or s0, t3, s0
; RV64X60-NEXT:    slli t3, t2, 1
; RV64X60-NEXT:    maxu s1, t3, t5
; RV64X60-NEXT:    or s0, t4, s0
; RV64X60-NEXT:    sltu s1, a6, s1
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    andi t4, s0, 1
; RV64X60-NEXT:    mv t5, a0
; RV64X60-NEXT:    j .LBB0_4
; RV64X60-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    add t5, t5, a1
; RV64X60-NEXT:    add a2, a2, a3
; RV64X60-NEXT:    add a4, a4, a5
; RV64X60-NEXT:    addiw t1, t1, 1
; RV64X60-NEXT:    addi t0, t0, 1
; RV64X60-NEXT:    beq t1, a7, .LBB0_11
; RV64X60-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64X60-NEXT:    # =>This Loop Header: Depth=1
; RV64X60-NEXT:    # Child Loop BB0_7 Depth 2
; RV64X60-NEXT:    # Child Loop BB0_10 Depth 2
; RV64X60-NEXT:    beqz t4, .LBB0_6
; RV64X60-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    li t6, 0
; RV64X60-NEXT:    j .LBB0_9
; RV64X60-NEXT:  .LBB0_6: # %vector.ph
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    slli s1, t2, 28
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    and t6, s1, a6
; RV64X60-NEXT:    mv s2, a2
; RV64X60-NEXT:    mv s3, a4
; RV64X60-NEXT:    mv s4, t5
; RV64X60-NEXT:    mv s1, t6
; RV64X60-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64X60-NEXT:  .LBB0_7: # %vector.body
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    vl2r.v v8, (s2)
; RV64X60-NEXT:    vl2r.v v10, (s3)
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    add s3, s3, t3
; RV64X60-NEXT:    vaaddu.vv v8, v8, v10
; RV64X60-NEXT:    vs2r.v v8, (s4)
; RV64X60-NEXT:    add s4, s4, t3
; RV64X60-NEXT:    add s2, s2, t3
; RV64X60-NEXT:    bnez s1, .LBB0_7
; RV64X60-NEXT:  # %bb.8: # %middle.block
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    beq t6, a6, .LBB0_3
; RV64X60-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    mul s2, a1, t0
; RV64X60-NEXT:    add s1, a0, a6
; RV64X60-NEXT:    add s0, t5, t6
; RV64X60-NEXT:    add s2, s2, s1
; RV64X60-NEXT:    add s4, a4, t6
; RV64X60-NEXT:    add t6, t6, a2
; RV64X60-NEXT:  .LBB0_10: # %for.body4.us
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    lbu s3, 0(t6)
; RV64X60-NEXT:    lbu s1, 0(s4)
; RV64X60-NEXT:    add s1, s1, s3
; RV64X60-NEXT:    addi s1, s1, 1
; RV64X60-NEXT:    srli s1, s1, 1
; RV64X60-NEXT:    sb s1, 0(s0)
; RV64X60-NEXT:    addi s0, s0, 1
; RV64X60-NEXT:    addi s4, s4, 1
; RV64X60-NEXT:    addi t6, t6, 1
; RV64X60-NEXT:    bne s0, s2, .LBB0_10
; RV64X60-NEXT:    j .LBB0_3
; RV64X60-NEXT:  .LBB0_11:
; RV64X60-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    .cfi_restore s0
; RV64X60-NEXT:    .cfi_restore s1
; RV64X60-NEXT:    .cfi_restore s2
; RV64X60-NEXT:    .cfi_restore s3
; RV64X60-NEXT:    .cfi_restore s4
; RV64X60-NEXT:    addi sp, sp, 48
; RV64X60-NEXT:    .cfi_def_cfa_offset 0
; RV64X60-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64X60-NEXT:    ret
;
; RV64-LABEL: test1:
; RV64:       # %bb.0: # %entry
; RV64-NEXT:    blez a7, .LBB0_14
; RV64-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64-NEXT:    blez a6, .LBB0_14
; RV64-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    addi sp, sp, -48
; RV64-NEXT:    .cfi_def_cfa_offset 48
; RV64-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset s0, -8
; RV64-NEXT:    .cfi_offset s1, -16
; RV64-NEXT:    .cfi_offset s2, -24
; RV64-NEXT:    .cfi_offset s3, -32
; RV64-NEXT:    .cfi_offset s4, -40
; RV64-NEXT:    addi t1, a7, -1
; RV64-NEXT:    add t5, a0, a6
; RV64-NEXT:    add s0, a2, a6
; RV64-NEXT:    add t6, a4, a6
; RV64-NEXT:    csrr t0, vlenb
; RV64-NEXT:    li t2, 32
; RV64-NEXT:    slli t1, t1, 32
; RV64-NEXT:    srli t3, t1, 32
; RV64-NEXT:    mul t1, a1, t3
; RV64-NEXT:    add t5, t5, t1
; RV64-NEXT:    mul t1, a3, t3
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    slli t1, t0, 1
; RV64-NEXT:    mul t3, a5, t3
; RV64-NEXT:    add t6, t6, t3
; RV64-NEXT:    mv t4, t1
; RV64-NEXT:    bltu t2, t1, .LBB0_4
; RV64-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t4, 32
; RV64-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t2, 0
; RV64-NEXT:    li t3, 0
; RV64-NEXT:    sltu s0, a0, s0
; RV64-NEXT:    sltu s1, a2, t5
; RV64-NEXT:    and s0, s0, s1
; RV64-NEXT:    sltu t6, a0, t6
; RV64-NEXT:    sltu t5, a4, t5
; RV64-NEXT:    and t5, t6, t5
; RV64-NEXT:    or t6, a1, a3
; RV64-NEXT:    slti t6, t6, 0
; RV64-NEXT:    or t6, s0, t6
; RV64-NEXT:    or s0, a1, a5
; RV64-NEXT:    slti s0, s0, 0
; RV64-NEXT:    or t5, t5, s0
; RV64-NEXT:    or t5, t6, t5
; RV64-NEXT:    sltu t4, a6, t4
; RV64-NEXT:    or t4, t4, t5
; RV64-NEXT:    andi t4, t4, 1
; RV64-NEXT:    mv t5, a0
; RV64-NEXT:    j .LBB0_6
; RV64-NEXT:  .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    add t5, t5, a1
; RV64-NEXT:    add a2, a2, a3
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    addiw t3, t3, 1
; RV64-NEXT:    addi t2, t2, 1
; RV64-NEXT:    beq t3, a7, .LBB0_13
; RV64-NEXT:  .LBB0_6: # %for.cond1.preheader.us
; RV64-NEXT:    # =>This Loop Header: Depth=1
; RV64-NEXT:    # Child Loop BB0_9 Depth 2
; RV64-NEXT:    # Child Loop BB0_12 Depth 2
; RV64-NEXT:    beqz t4, .LBB0_8
; RV64-NEXT:  # %bb.7: # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    li t6, 0
; RV64-NEXT:    j .LBB0_11
; RV64-NEXT:  .LBB0_8: # %vector.ph
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    slli t6, t0, 28
; RV64-NEXT:    sub t6, t6, t1
; RV64-NEXT:    and t6, t6, a6
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    mv s0, a2
; RV64-NEXT:    mv s1, a4
; RV64-NEXT:    mv s2, t5
; RV64-NEXT:    mv s3, t6
; RV64-NEXT:    vsetvli s4, zero, e8, m2, ta, ma
; RV64-NEXT:  .LBB0_9: # %vector.body
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    vl2r.v v8, (s0)
; RV64-NEXT:    vl2r.v v10, (s1)
; RV64-NEXT:    sub s3, s3, t1
; RV64-NEXT:    add s1, s1, t1
; RV64-NEXT:    vaaddu.vv v8, v8, v10
; RV64-NEXT:    vs2r.v v8, (s2)
; RV64-NEXT:    add s2, s2, t1
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    bnez s3, .LBB0_9
; RV64-NEXT:  # %bb.10: # %middle.block
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    beq t6, a6, .LBB0_5
; RV64-NEXT:  .LBB0_11: # %for.body4.us.preheader
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    mul s1, a1, t2
; RV64-NEXT:    add s2, a0, a6
; RV64-NEXT:    add s0, t5, t6
; RV64-NEXT:    add s1, s2, s1
; RV64-NEXT:    add s2, a4, t6
; RV64-NEXT:    add t6, a2, t6
; RV64-NEXT:  .LBB0_12: # %for.body4.us
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    lbu s3, 0(t6)
; RV64-NEXT:    lbu s4, 0(s2)
; RV64-NEXT:    add s3, s3, s4
; RV64-NEXT:    addi s3, s3, 1
; RV64-NEXT:    srli s3, s3, 1
; RV64-NEXT:    sb s3, 0(s0)
; RV64-NEXT:    addi s0, s0, 1
; RV64-NEXT:    addi s2, s2, 1
; RV64-NEXT:    addi t6, t6, 1
; RV64-NEXT:    bne s0, s1, .LBB0_12
; RV64-NEXT:    j .LBB0_5
; RV64-NEXT:  .LBB0_13:
; RV64-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    .cfi_restore s1
; RV64-NEXT:    .cfi_restore s2
; RV64-NEXT:    .cfi_restore s3
; RV64-NEXT:    .cfi_restore s4
; RV64-NEXT:    addi sp, sp, 48
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:  .LBB0_14: # %for.cond.cleanup
; RV64-NEXT:    ret
entry:
  %cmp29 = icmp sgt i32 %i_height, 0
  br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup

for.cond1.preheader.lr.ph:                        ; preds = %entry
  %cmp227 = icmp sgt i32 %i_width, 0
  %idx.ext = sext i32 %i_dst_stride to i64
  %idx.ext12 = sext i32 %i_src1_stride to i64
  %idx.ext14 = sext i32 %i_src2_stride to i64
  br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
  %wide.trip.count = zext nneg i32 %i_width to i64
  %0 = add nsw i32 %i_height, -1
  %1 = zext i32 %0 to i64
  %2 = mul nsw i64 %idx.ext, %1
  %3 = getelementptr i8, ptr %dst, i64 %2
  %scevgep = getelementptr i8, ptr %3, i64 %wide.trip.count
  %4 = mul nsw i64 %idx.ext12, %1
  %5 = getelementptr i8, ptr %src1, i64 %4
  %scevgep36 = getelementptr i8, ptr %5, i64 %wide.trip.count
  %6 = mul nsw i64 %idx.ext14, %1
  %7 = getelementptr i8, ptr %src2, i64 %6
  %scevgep37 = getelementptr i8, ptr %7, i64 %wide.trip.count
  %8 = tail call i64 @llvm.vscale.i64()
  %9 = shl nuw nsw i64 %8, 4
  %10 = tail call i64 @llvm.umax.i64(i64 %9, i64 32)
  %min.iters.check = icmp ugt i64 %10, %wide.trip.count
  %bound0 = icmp ult ptr %dst, %scevgep36
  %bound1 = icmp ult ptr %src1, %scevgep
  %found.conflict = and i1 %bound0, %bound1
  %11 = or i32 %i_dst_stride, %i_src1_stride
  %12 = icmp slt i32 %11, 0
  %13 = or i1 %found.conflict, %12
  %bound039 = icmp ult ptr %dst, %scevgep37
  %bound140 = icmp ult ptr %src2, %scevgep
  %found.conflict41 = and i1 %bound039, %bound140
  %14 = or i32 %i_dst_stride, %i_src2_stride
  %15 = icmp slt i32 %14, 0
  %16 = or i1 %found.conflict41, %15
  %conflict.rdx = or i1 %13, %16
  br label %for.cond1.preheader.us

for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  %y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
  %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ]
  %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ]
  %brmerge = select i1 %min.iters.check, i1 true, i1 %conflict.rdx
  br i1 %brmerge, label %for.body4.us.preheader, label %vector.ph

vector.ph:                                        ; preds = %for.cond1.preheader.us
  %17 = tail call i64 @llvm.vscale.i64()
  %.neg = mul nuw nsw i64 %17, 2147483632
  %n.vec = and i64 %.neg, %wide.trip.count
  %18 = tail call i64 @llvm.vscale.i64()
  %19 = shl nuw nsw i64 %18, 4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %20 = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %index
  %wide.load = load <vscale x 16 x i8>, ptr %20, align 1
  %21 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i16>
  %22 = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %index
  %wide.load44 = load <vscale x 16 x i8>, ptr %22, align 1
  %23 = zext <vscale x 16 x i8> %wide.load44 to <vscale x 16 x i16>
  %24 = add nuw nsw <vscale x 16 x i16> %21, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %25 = add nuw nsw <vscale x 16 x i16> %24, %23
  %26 = lshr <vscale x 16 x i16> %25, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %27 = trunc <vscale x 16 x i16> %26 to <vscale x 16 x i8>
  %28 = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %index
  store <vscale x 16 x i8> %27, ptr %28, align 1
  %index.next = add nuw i64 %index, %19
  %29 = icmp eq i64 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader

for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
  br label %for.body4.us

for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
  %30 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %30 to i16
  %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
  %31 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %31 to i16
  %add.us = add nuw nsw i16 %conv.us, 1
  %add8.us = add nuw nsw i16 %add.us, %conv7.us
  %shr.us = lshr i16 %add8.us, 1
  %conv9.us = trunc nuw i16 %shr.us to i8
  %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
  store i8 %conv9.us, ptr %arrayidx11.us, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
  %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
  %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
  %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
  %inc17.us = add nuw nsw i32 %y.033.us, 1
  %exitcond35.not = icmp eq i32 %inc17.us, %i_height
  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
  ret void
}