; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 -hoist-const-loads=false < %s | FileCheck %s
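; This test appears to exercise the greedy register allocator's local interval
; cost handling (inferred from the file name): the loop below keeps many
; <2 x i64> accumulators live at once, putting heavy pressure on the vector
; register file. The CHECK lines are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.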

@A = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
@B = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8
@C = external dso_local local_unnamed_addr global [8 x [8 x i64]], align 8

define dso_local void @run_test() local_unnamed_addr uwtable {
; CHECK-LABEL: run_test:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sub sp, sp, #208
; CHECK-NEXT:    .cfi_def_cfa_offset 208
; CHECK-NEXT:    stp d15, d14, [sp, #96] // 16-byte Folded Spill
; CHECK-NEXT:    stp d13, d12, [sp, #112] // 16-byte Folded Spill
; CHECK-NEXT:    stp d11, d10, [sp, #128] // 16-byte Folded Spill
; CHECK-NEXT:    stp d9, d8, [sp, #144] // 16-byte Folded Spill
; CHECK-NEXT:    str x23, [sp, #160] // 8-byte Folded Spill
; CHECK-NEXT:    stp x22, x21, [sp, #176] // 16-byte Folded Spill
; CHECK-NEXT:    stp x20, x19, [sp, #192] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_offset w19, -8
; CHECK-NEXT:    .cfi_offset w20, -16
; CHECK-NEXT:    .cfi_offset w21, -24
; CHECK-NEXT:    .cfi_offset w22, -32
; CHECK-NEXT:    .cfi_offset w23, -48
; CHECK-NEXT:    .cfi_offset b8, -56
; CHECK-NEXT:    .cfi_offset b9, -64
; CHECK-NEXT:    .cfi_offset b10, -72
; CHECK-NEXT:    .cfi_offset b11, -80
; CHECK-NEXT:    .cfi_offset b12, -88
; CHECK-NEXT:    .cfi_offset b13, -96
; CHECK-NEXT:    .cfi_offset b14, -104
; CHECK-NEXT:    .cfi_offset b15, -112
; CHECK-NEXT:    movi v2.2d, #0000000000000000
; CHECK-NEXT:    // implicit-def: $q1
; CHECK-NEXT:    mov x8, xzr
; CHECK-NEXT:    adrp x9, B+48
; CHECK-NEXT:    add x9, x9, :lo12:B+48
; CHECK-NEXT:    adrp x10, A
; CHECK-NEXT:    add x10, x10, :lo12:A
; CHECK-NEXT:    mov x11, xzr
; CHECK-NEXT:    // kill: killed $q1
; CHECK-NEXT:    // implicit-def: $q1
; CHECK-NEXT:    mov x12, xzr
; CHECK-NEXT:    // implicit-def: $q0
; CHECK-NEXT:    // implicit-def: $q3
; CHECK-NEXT:    // implicit-def: $q4
; CHECK-NEXT:    // implicit-def: $q5
; CHECK-NEXT:    // implicit-def: $q7
; CHECK-NEXT:    // implicit-def: $q10
; CHECK-NEXT:    // implicit-def: $q17
; CHECK-NEXT:    // implicit-def: $q6
; CHECK-NEXT:    // implicit-def: $q18
; CHECK-NEXT:    // implicit-def: $q19
; CHECK-NEXT:    // implicit-def: $q20
; CHECK-NEXT:    // implicit-def: $q21
; CHECK-NEXT:    // implicit-def: $q22
; CHECK-NEXT:    // implicit-def: $q23
; CHECK-NEXT:    // implicit-def: $q24
; CHECK-NEXT:    // implicit-def: $q9
; CHECK-NEXT:    // implicit-def: $q27
; CHECK-NEXT:    // implicit-def: $q12
; CHECK-NEXT:    // implicit-def: $q28
; CHECK-NEXT:    // implicit-def: $q14
; CHECK-NEXT:    // implicit-def: $q15
; CHECK-NEXT:    // implicit-def: $q29
; CHECK-NEXT:    // implicit-def: $q30
; CHECK-NEXT:    // implicit-def: $q11
; CHECK-NEXT:    // implicit-def: $q31
; CHECK-NEXT:    // implicit-def: $q13
; CHECK-NEXT:    // kill: killed $q1
; CHECK-NEXT:    // implicit-def: $q1
; CHECK-NEXT:    // kill: killed $q1
; CHECK-NEXT:  .LBB0_1: // %for.cond1.preheader
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    stp q29, q15, [sp] // 32-byte Folded Spill
; CHECK-NEXT:    ldr q15, [x8]
; CHECK-NEXT:    ldr x15, [x8]
; CHECK-NEXT:    str q14, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    add x20, x10, x11
; CHECK-NEXT:    mov v8.16b, v28.16b
; CHECK-NEXT:    fmov x2, d15
; CHECK-NEXT:    mov x17, v15.d[1]
; CHECK-NEXT:    ldr q14, [x8]
; CHECK-NEXT:    mov v28.16b, v24.16b
; CHECK-NEXT:    mov v24.16b, v20.16b
; CHECK-NEXT:    mov v20.16b, v17.16b
; CHECK-NEXT:    fmov x13, d14
; CHECK-NEXT:    mov x16, v14.d[1]
; CHECK-NEXT:    mov v17.16b, v5.16b
; CHECK-NEXT:    mul x3, x2, x15
; CHECK-NEXT:    ldr q14, [x9], #64
; CHECK-NEXT:    ldr q5, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x6, [x8]
; CHECK-NEXT:    ldr x20, [x20, #128]
; CHECK-NEXT:    mul x1, x17, x15
; CHECK-NEXT:    mov x14, v14.d[1]
; CHECK-NEXT:    fmov x5, d14
; CHECK-NEXT:    mov v29.16b, v21.16b
; CHECK-NEXT:    mov v21.16b, v0.16b
; CHECK-NEXT:    mov v25.16b, v6.16b
; CHECK-NEXT:    mul x18, x13, x15
; CHECK-NEXT:    mov v6.16b, v2.16b
; CHECK-NEXT:    mov v26.16b, v22.16b
; CHECK-NEXT:    fmov d15, x3
; CHECK-NEXT:    mov v22.16b, v18.16b
; CHECK-NEXT:    mov v18.16b, v7.16b
; CHECK-NEXT:    mul x0, x16, x15
; CHECK-NEXT:    mov v7.16b, v3.16b
; CHECK-NEXT:    mov v16.16b, v4.16b
; CHECK-NEXT:    add x11, x11, #8
; CHECK-NEXT:    add x12, x12, #1
; CHECK-NEXT:    mov v15.d[1], x1
; CHECK-NEXT:    mul x4, x14, x15
; CHECK-NEXT:    cmp x11, #64
; CHECK-NEXT:    fmov d14, x18
; CHECK-NEXT:    mul x15, x5, x15
; CHECK-NEXT:    add v5.2d, v5.2d, v15.2d
; CHECK-NEXT:    mul x21, x2, x6
; CHECK-NEXT:    mov v14.d[1], x0
; CHECK-NEXT:    mul x2, x2, x20
; CHECK-NEXT:    fmov d0, x15
; CHECK-NEXT:    str q5, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    ldr q5, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    mul x22, x13, x20
; CHECK-NEXT:    add v5.2d, v5.2d, v14.2d
; CHECK-NEXT:    fmov d3, x21
; CHECK-NEXT:    mul x19, x17, x6
; CHECK-NEXT:    mov v0.d[1], x4
; CHECK-NEXT:    fmov d1, x2
; CHECK-NEXT:    mul x17, x17, x20
; CHECK-NEXT:    str q5, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    add v5.2d, v13.2d, v14.2d
; CHECK-NEXT:    fmov d2, x22
; CHECK-NEXT:    ldr q13, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT:    mul x7, x16, x6
; CHECK-NEXT:    ldp q15, q14, [sp, #16] // 32-byte Folded Reload
; CHECK-NEXT:    mov v3.d[1], x19
; CHECK-NEXT:    add v13.2d, v13.2d, v0.2d
; CHECK-NEXT:    mul x16, x16, x20
; CHECK-NEXT:    mov v1.d[1], x17
; CHECK-NEXT:    mul x23, x5, x20
; CHECK-NEXT:    str q13, [sp, #80] // 16-byte Folded Spill
; CHECK-NEXT:    mov v13.16b, v5.16b
; CHECK-NEXT:    mov v5.16b, v17.16b
; CHECK-NEXT:    mov v17.16b, v20.16b
; CHECK-NEXT:    mov v20.16b, v24.16b
; CHECK-NEXT:    mul x13, x13, x6
; CHECK-NEXT:    mov v24.16b, v28.16b
; CHECK-NEXT:    add v11.2d, v11.2d, v3.2d
; CHECK-NEXT:    mov v2.d[1], x16
; CHECK-NEXT:    add v15.2d, v15.2d, v1.2d
; CHECK-NEXT:    add v27.2d, v27.2d, v3.2d
; CHECK-NEXT:    mul x18, x14, x20
; CHECK-NEXT:    add v23.2d, v23.2d, v3.2d
; CHECK-NEXT:    add v19.2d, v19.2d, v3.2d
; CHECK-NEXT:    fmov d4, x23
; CHECK-NEXT:    add v10.2d, v10.2d, v3.2d
; CHECK-NEXT:    mul x15, x5, x6
; CHECK-NEXT:    fmov d0, x13
; CHECK-NEXT:    add v14.2d, v14.2d, v2.2d
; CHECK-NEXT:    add v2.2d, v6.2d, v3.2d
; CHECK-NEXT:    mul x14, x14, x6
; CHECK-NEXT:    mov v3.16b, v7.16b
; CHECK-NEXT:    mov v7.16b, v18.16b
; CHECK-NEXT:    mov v4.d[1], x18
; CHECK-NEXT:    mov v18.16b, v22.16b
; CHECK-NEXT:    mov v0.d[1], x7
; CHECK-NEXT:    fmov d1, x15
; CHECK-NEXT:    add v28.2d, v8.2d, v4.2d
; CHECK-NEXT:    mov v1.d[1], x14
; CHECK-NEXT:    add v31.2d, v31.2d, v0.2d
; CHECK-NEXT:    add v30.2d, v30.2d, v0.2d
; CHECK-NEXT:    add v12.2d, v12.2d, v0.2d
; CHECK-NEXT:    add v24.2d, v24.2d, v0.2d
; CHECK-NEXT:    add v22.2d, v26.2d, v0.2d
; CHECK-NEXT:    add v20.2d, v20.2d, v0.2d
; CHECK-NEXT:    add v18.2d, v18.2d, v0.2d
; CHECK-NEXT:    add v17.2d, v17.2d, v0.2d
; CHECK-NEXT:    add v7.2d, v7.2d, v0.2d
; CHECK-NEXT:    add v4.2d, v16.2d, v0.2d
; CHECK-NEXT:    add v3.2d, v3.2d, v0.2d
; CHECK-NEXT:    mov v0.16b, v21.16b
; CHECK-NEXT:    mov v21.16b, v29.16b
; CHECK-NEXT:    ldr q29, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    add v9.2d, v9.2d, v1.2d
; CHECK-NEXT:    add v6.2d, v25.2d, v1.2d
; CHECK-NEXT:    add v5.2d, v5.2d, v1.2d
; CHECK-NEXT:    add v29.2d, v29.2d, v1.2d
; CHECK-NEXT:    add v21.2d, v21.2d, v1.2d
; CHECK-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    b.ne .LBB0_1
; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
; CHECK-NEXT:    ldr q1, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    adrp x8, C
; CHECK-NEXT:    add x8, x8, :lo12:C
; CHECK-NEXT:    stp q11, q30, [x8, #80]
; CHECK-NEXT:    ldp x20, x19, [sp, #192] // 16-byte Folded Reload
; CHECK-NEXT:    str q1, [x8]
; CHECK-NEXT:    ldr q1, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ldr x23, [sp, #160] // 8-byte Folded Reload
; CHECK-NEXT:    stp q15, q14, [x8, #144]
; CHECK-NEXT:    ldp x22, x21, [sp, #176] // 16-byte Folded Reload
; CHECK-NEXT:    stp q1, q13, [x8, #16]
; CHECK-NEXT:    ldr q1, [sp, #80] // 16-byte Folded Reload
; CHECK-NEXT:    stp q28, q12, [x8, #176]
; CHECK-NEXT:    ldp d13, d12, [sp, #112] // 16-byte Folded Reload
; CHECK-NEXT:    stp q1, q31, [x8, #48]
; CHECK-NEXT:    ldp d15, d14, [sp, #96] // 16-byte Folded Reload
; CHECK-NEXT:    stp q9, q24, [x8, #240]
; CHECK-NEXT:    ldp d9, d8, [sp, #144] // 16-byte Folded Reload
; CHECK-NEXT:    stp q19, q18, [x8, #336]
; CHECK-NEXT:    stp q10, q7, [x8, #400]
; CHECK-NEXT:    ldp d11, d10, [sp, #128] // 16-byte Folded Reload
; CHECK-NEXT:    str q29, [x8, #112]
; CHECK-NEXT:    str q27, [x8, #208]
; CHECK-NEXT:    stp q23, q22, [x8, #272]
; CHECK-NEXT:    stp q21, q20, [x8, #304]
; CHECK-NEXT:    stp q6, q17, [x8, #368]
; CHECK-NEXT:    stp q5, q4, [x8, #432]
; CHECK-NEXT:    stp q2, q3, [x8, #464]
; CHECK-NEXT:    str q0, [x8, #496]
; CHECK-NEXT:    add sp, sp, #208
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w19
; CHECK-NEXT:    .cfi_restore w20
; CHECK-NEXT:    .cfi_restore w21
; CHECK-NEXT:    .cfi_restore w22
; CHECK-NEXT:    .cfi_restore w23
; CHECK-NEXT:    .cfi_restore b8
; CHECK-NEXT:    .cfi_restore b9
; CHECK-NEXT:    .cfi_restore b10
; CHECK-NEXT:    .cfi_restore b11
; CHECK-NEXT:    .cfi_restore b12
; CHECK-NEXT:    .cfi_restore b13
; CHECK-NEXT:    .cfi_restore b14
; CHECK-NEXT:    .cfi_restore b15
; CHECK-NEXT:    ret
entry:
  br label %for.cond1.preheader

for.cond1.preheader:                              ; preds = %for.cond1.preheader, %entry
  %0 = phi <2 x i64> [ undef, %entry ], [ %118, %for.cond1.preheader ]
  %1 = phi <2 x i64> [ undef, %entry ], [ %116, %for.cond1.preheader ]
  %2 = phi <2 x i64> [ zeroinitializer, %entry ], [ %114, %for.cond1.preheader ]
  %3 = phi <2 x i64> [ undef, %entry ], [ %112, %for.cond1.preheader ]
  %4 = phi <2 x i64> [ undef, %entry ], [ %107, %for.cond1.preheader ]
  %5 = phi <2 x i64> [ undef, %entry ], [ %105, %for.cond1.preheader ]
  %6 = phi <2 x i64> [ undef, %entry ], [ %103, %for.cond1.preheader ]
  %7 = phi <2 x i64> [ undef, %entry ], [ %101, %for.cond1.preheader ]
  %8 = phi <2 x i64> [ undef, %entry ], [ %96, %for.cond1.preheader ]
  %9 = phi <2 x i64> [ undef, %entry ], [ %94, %for.cond1.preheader ]
  %10 = phi <2 x i64> [ undef, %entry ], [ %92, %for.cond1.preheader ]
  %11 = phi <2 x i64> [ undef, %entry ], [ %90, %for.cond1.preheader ]
  %12 = phi <2 x i64> [ undef, %entry ], [ %85, %for.cond1.preheader ]
  %13 = phi <2 x i64> [ undef, %entry ], [ %83, %for.cond1.preheader ]
  %14 = phi <2 x i64> [ undef, %entry ], [ %81, %for.cond1.preheader ]
  %15 = phi <2 x i64> [ undef, %entry ], [ %79, %for.cond1.preheader ]
  %16 = phi <2 x i64> [ undef, %entry ], [ %74, %for.cond1.preheader ]
  %17 = phi <2 x i64> [ undef, %entry ], [ %72, %for.cond1.preheader ]
  %18 = phi <2 x i64> [ undef, %entry ], [ %70, %for.cond1.preheader ]
  %19 = phi <2 x i64> [ undef, %entry ], [ %65, %for.cond1.preheader ]
  %20 = phi <2 x i64> [ undef, %entry ], [ %63, %for.cond1.preheader ]
  %21 = phi <2 x i64> [ undef, %entry ], [ %61, %for.cond1.preheader ]
  %22 = phi <2 x i64> [ undef, %entry ], [ %56, %for.cond1.preheader ]
  %23 = phi <2 x i64> [ undef, %entry ], [ %54, %for.cond1.preheader ]
  %24 = phi <2 x i64> [ undef, %entry ], [ %52, %for.cond1.preheader ]
  %25 = phi <2 x i64> [ undef, %entry ], [ %50, %for.cond1.preheader ]
  %26 = phi <2 x i64> [ undef, %entry ], [ %45, %for.cond1.preheader ]
  %27 = phi <2 x i64> [ undef, %entry ], [ %43, %for.cond1.preheader ]
  %28 = phi <2 x i64> [ undef, %entry ], [ %41, %for.cond1.preheader ]
  %29 = phi <2 x i64> [ undef, %entry ], [ %39, %for.cond1.preheader ]
  %indvars.iv40 = phi i64 [ 0, %entry ], [ %indvars.iv.next41, %for.cond1.preheader ]
  %30 = load <2 x i64>, ptr null, align 8
  %31 = load <2 x i64>, ptr undef, align 8
  %arrayidx14.4.phi.trans.insert = getelementptr inbounds [8 x [8 x i64]], ptr @B, i64 0, i64 %indvars.iv40, i64 4
  %32 = load <2 x i64>, ptr null, align 8
  %arrayidx14.6.phi.trans.insert = getelementptr inbounds [8 x [8 x i64]], ptr @B, i64 0, i64 %indvars.iv40, i64 6
  %33 = bitcast ptr %arrayidx14.6.phi.trans.insert to ptr
  %34 = load <2 x i64>, ptr %33, align 8
  %35 = load i64, ptr null, align 8
  %36 = insertelement <2 x i64> undef, i64 %35, i32 0
  %37 = shufflevector <2 x i64> %36, <2 x i64> undef, <2 x i32> zeroinitializer
  %38 = mul nsw <2 x i64> %30, %37
  %39 = add nsw <2 x i64> %29, %38
  %40 = mul nsw <2 x i64> %31, %37
  %41 = add nsw <2 x i64> %28, %40
  %42 = mul nsw <2 x i64> %32, %37
  %43 = add nsw <2 x i64> %27, %42
  %44 = mul nsw <2 x i64> %34, %37
  %45 = add nsw <2 x i64> %26, %44
  %46 = load i64, ptr undef, align 8
  %47 = insertelement <2 x i64> undef, i64 %46, i32 0
  %48 = shufflevector <2 x i64> %47, <2 x i64> undef, <2 x i32> zeroinitializer
  %49 = mul nsw <2 x i64> %30, %48
  %50 = add nsw <2 x i64> %25, %49
  %51 = mul nsw <2 x i64> %31, %48
  %52 = add nsw <2 x i64> %24, %51
  %53 = mul nsw <2 x i64> %32, %48
  %54 = add nsw <2 x i64> %23, %53
  %55 = mul nsw <2 x i64> %34, %48
  %56 = add nsw <2 x i64> %22, %55
  %arrayidx10.2 = getelementptr inbounds [8 x [8 x i64]], ptr @A, i64 0, i64 2, i64 %indvars.iv40
  %57 = load i64, ptr %arrayidx10.2, align 8
  %58 = insertelement <2 x i64> undef, i64 %57, i32 0
  %59 = shufflevector <2 x i64> %58, <2 x i64> undef, <2 x i32> zeroinitializer
  %60 = mul nsw <2 x i64> %31, %59
  %61 = add nsw <2 x i64> %21, %60
  %62 = mul nsw <2 x i64> %32, %59
  %63 = add nsw <2 x i64> %20, %62
  %64 = mul nsw <2 x i64> %34, %59
  %65 = add nsw <2 x i64> %19, %64
  %66 = load i64, ptr undef, align 8
  %67 = insertelement <2 x i64> undef, i64 %66, i32 0
  %68 = shufflevector <2 x i64> %67, <2 x i64> undef, <2 x i32> zeroinitializer
  %69 = mul nsw <2 x i64> %30, %68
  %70 = add nsw <2 x i64> %18, %69
  %71 = mul nsw <2 x i64> %31, %68
  %72 = add nsw <2 x i64> %17, %71
  %73 = mul nsw <2 x i64> %34, %68
  %74 = add nsw <2 x i64> %16, %73
  %75 = load i64, ptr undef, align 8
  %76 = insertelement <2 x i64> undef, i64 %75, i32 0
  %77 = shufflevector <2 x i64> %76, <2 x i64> undef, <2 x i32> zeroinitializer
  %78 = mul nsw <2 x i64> %30, %77
  %79 = add nsw <2 x i64> %15, %78
  %80 = mul nsw <2 x i64> %31, %77
  %81 = add nsw <2 x i64> %14, %80
  %82 = mul nsw <2 x i64> %32, %77
  %83 = add nsw <2 x i64> %13, %82
  %84 = mul nsw <2 x i64> %34, %77
  %85 = add nsw <2 x i64> %12, %84
  %86 = load i64, ptr undef, align 8
  %87 = insertelement <2 x i64> undef, i64 %86, i32 0
  %88 = shufflevector <2 x i64> %87, <2 x i64> undef, <2 x i32> zeroinitializer
  %89 = mul nsw <2 x i64> %30, %88
  %90 = add nsw <2 x i64> %11, %89
  %91 = mul nsw <2 x i64> %31, %88
  %92 = add nsw <2 x i64> %10, %91
  %93 = mul nsw <2 x i64> %32, %88
  %94 = add nsw <2 x i64> %9, %93
  %95 = mul nsw <2 x i64> %34, %88
  %96 = add nsw <2 x i64> %8, %95
  %97 = load i64, ptr undef, align 8
  %98 = insertelement <2 x i64> undef, i64 %97, i32 0
  %99 = shufflevector <2 x i64> %98, <2 x i64> undef, <2 x i32> zeroinitializer
  %100 = mul nsw <2 x i64> %30, %99
  %101 = add nsw <2 x i64> %7, %100
  %102 = mul nsw <2 x i64> %31, %99
  %103 = add nsw <2 x i64> %6, %102
  %104 = mul nsw <2 x i64> %32, %99
  %105 = add nsw <2 x i64> %5, %104
  %106 = mul nsw <2 x i64> %34, %99
  %107 = add nsw <2 x i64> %4, %106
  %108 = load i64, ptr undef, align 8
  %109 = insertelement <2 x i64> undef, i64 %108, i32 0
  %110 = shufflevector <2 x i64> %109, <2 x i64> undef, <2 x i32> zeroinitializer
  %111 = mul nsw <2 x i64> %30, %110
  %112 = add nsw <2 x i64> %3, %111
  %113 = mul nsw <2 x i64> %31, %110
  %114 = add nsw <2 x i64> %2, %113
  %115 = mul nsw <2 x i64> %32, %110
  %116 = add nsw <2 x i64> %1, %115
  %117 = mul nsw <2 x i64> %34, %110
  %118 = add nsw <2 x i64> %0, %117
  %indvars.iv.next41 = add nuw nsw i64 %indvars.iv40, 1
  %exitcond42 = icmp eq i64 %indvars.iv.next41, 8
  br i1 %exitcond42, label %for.cond.cleanup, label %for.cond1.preheader

for.cond.cleanup:                                 ; preds = %for.cond1.preheader
  store <2 x i64> %39, ptr @C, align 8
  store <2 x i64> %41, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 0, i64 2), align 8
  store <2 x i64> %43, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 0, i64 4), align 8
  store <2 x i64> %45, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 0, i64 6), align 8
  store <2 x i64> %50, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 1, i64 0), align 8
  store <2 x i64> %52, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 1, i64 2), align 8
  store <2 x i64> %54, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 1, i64 4), align 8
  store <2 x i64> %56, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 1, i64 6), align 8
  store <2 x i64> %61, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 2, i64 2), align 8
  store <2 x i64> %63, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 2, i64 4), align 8
  store <2 x i64> %65, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 2, i64 6), align 8
  store <2 x i64> %70, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 3, i64 0), align 8
  store <2 x i64> %72, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 3, i64 2), align 8
  store <2 x i64> %74, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 3, i64 6), align 8
  store <2 x i64> %79, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 4, i64 0), align 8
  store <2 x i64> %81, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 4, i64 2), align 8
  store <2 x i64> %83, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 4, i64 4), align 8
  store <2 x i64> %85, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 4, i64 6), align 8
  store <2 x i64> %90, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 5, i64 0), align 8
  store <2 x i64> %92, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 5, i64 2), align 8
  store <2 x i64> %94, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 5, i64 4), align 8
  store <2 x i64> %96, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 5, i64 6), align 8
  store <2 x i64> %101, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 6, i64 0), align 8
  store <2 x i64> %103, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 6, i64 2), align 8
  store <2 x i64> %105, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 6, i64 4), align 8
  store <2 x i64> %107, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 6, i64 6), align 8
  store <2 x i64> %112, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 7, i64 0), align 8
  store <2 x i64> %114, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 7, i64 2), align 8
  store <2 x i64> %116, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 7, i64 4), align 8
  store <2 x i64> %118, ptr getelementptr inbounds ([8 x [8 x i64]], ptr @C, i64 0, i64 7, i64 6), align 8
  ret void
}