; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx -combiner-stress-load-slicing < %s -o - | FileCheck %s --check-prefix=STRESS
; RUN: llc -mtriple x86_64-apple-macosx -mcpu=corei7-avx < %s -o - | FileCheck %s --check-prefix=REGULAR
;
; <rdar://problem/14477220>

%class.Complex = type { float, float }


; Check that independent slices lead to independent loads, and that each
; sliced load goes directly into the appropriate register file (no copy
; across register banks).
;
; The layout is:
; LSB 0 1 2 3 | 4 5 6 7 MSB
;       Low       High
; The base address points to byte 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 4 (base + 4 bytes) and is 4-byte aligned.
;
; STRESS-LABEL: _t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; STRESS: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the Imm and Real results into one vector.
; STRESS-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; STRESS-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
;
; Same for REGULAR: the copy across register banks is eliminated for each slice.
; REGULAR-LABEL: _t1:
; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
; Add low slice: out[out_start].real, this is base + 0.
; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
; Add high slice: out[out_start].imm, this is base + 4.
; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
; Pack the Imm and Real results into one vector.
; REGULAR-NEXT: vinsertps $16, [[RES_Imm]], [[RES_Real]], [[RES_Vec:%xmm[0-9]+]]
; Put the results back into out[out_start].
; REGULAR-NEXT: vmovlps [[RES_Vec]], ([[BASE]])
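;
; For reference, @t1 below is roughly the following C++ (an illustrative
; sketch only, assuming Complex provides a componentwise operator+; this
; is not the original reproducer from the radar):
;   void t1(Complex *out, unsigned long out_start) {
;     out[out_start] = out[out_start + 8] + out[out_start];
;   }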
define void @t1(ptr nocapture %out, i64 %out_start) {
entry:
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %tmp1 = load i64, ptr %arrayidx, align 8
  %t0.sroa.0.0.extract.trunc = trunc i64 %tmp1 to i32
  %tmp2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float
  %t0.sroa.2.0.extract.shift = lshr i64 %tmp1, 32
  %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32
  %tmp3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float
  %add = add i64 %out_start, 8
  %arrayidx2 = getelementptr inbounds %class.Complex, ptr %out, i64 %add
  %tmp4 = load float, ptr %arrayidx2, align 4
  %add.i = fadd float %tmp4, %tmp2
  %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0
  %r.i = getelementptr inbounds %class.Complex, ptr %arrayidx2, i64 0, i32 1
  %tmp5 = load float, ptr %r.i, align 4
  %add5.i = fadd float %tmp5, %tmp3
  %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1
  store <2 x float> %retval.sroa.0.4.vec.insert.i, ptr %arrayidx, align 4
  ret void
}

; Function Attrs: nounwind
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)

; Function Attrs: nounwind
declare void @llvm.lifetime.start.p0(i64, ptr nocapture)

; Function Attrs: nounwind
declare void @llvm.lifetime.end.p0(i64, ptr nocapture)

; Check that we do not read outside of the chunk of bits of the original load.
;
; The 64-bit load should have been split into one 32-bit slice and one
; 16-bit slice. The 16-bit slice should be zero-extended to match the
; final type.
;
; The memory layout is:
; LSB 0 1 2 3 | 4 5 | 6 7 MSB
;       Low           High
; The base address points to byte 0 and is 8-byte aligned.
; Low slice starts at 0 (base) and is 8-byte aligned.
; High slice starts at 6 (base + 6 bytes) and is 2-byte aligned.
;
; STRESS-LABEL: _t2:
; STRESS: movzwl 6([[BASE:[^)]+]]), %eax
; STRESS-NEXT: addl ([[BASE]]), %eax
; STRESS-NEXT: ret
;
; For the REGULAR heuristic, it is not profitable to slice loads whose
; slices are not adjacent in memory: here bytes #4-5 form a hole between
; the two slices, so the load is not split.
; REGULAR-LABEL: _t2:
; REGULAR: shrq $48
define i32 @t2(ptr nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %chunk64 = load i64, ptr %arrayidx, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %res = add i32 %slice32_high, %slice32_low
  ret i32 %res
}
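;
; In C++ terms, @t2 above computes roughly the following (an illustrative
; sketch only; t2_c and the pointer cast stand in for the IR's single
; 64-bit load):
;   uint32_t t2_c(const Complex *out, unsigned long out_start) {
;     uint64_t chunk = *(const uint64_t *)&out[out_start];
;     return (uint32_t)chunk + (uint32_t)(chunk >> 48);
;   }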

; Check that we do not optimize overlapping slices.
;
; The 64-bit load should NOT have been split, because the slices overlap:
; First slice uses bytes numbered 0 to 3.
; Second slice uses bytes numbered 6 and 7.
; Third slice uses bytes numbered 4 to 7.
;
; STRESS-LABEL: _t3:
; STRESS: shrq $48
; STRESS: shrq $32
;
; REGULAR-LABEL: _t3:
; REGULAR: shrq $48
; REGULAR: shrq $32
define i32 @t3(ptr nocapture %out, i64 %out_start) {
  %arrayidx = getelementptr inbounds %class.Complex, ptr %out, i64 %out_start
  %chunk64 = load i64, ptr %arrayidx, align 8
  %slice32_low = trunc i64 %chunk64 to i32
  %shift48 = lshr i64 %chunk64, 48
  %slice32_high = trunc i64 %shift48 to i32
  %shift32 = lshr i64 %chunk64, 32
  %slice32_lowhigh = trunc i64 %shift32 to i32
  %tmpres = add i32 %slice32_high, %slice32_low
  %res = add i32 %slice32_lowhigh, %tmpres
  ret i32 %res
}
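;
; In the same C++ terms, @t3 above is roughly (illustrative sketch only,
; with t3_c and the pointer cast standing in for the single 64-bit load):
;   uint32_t t3_c(const Complex *out, unsigned long out_start) {
;     uint64_t chunk = *(const uint64_t *)&out[out_start];
;     return ((uint32_t)(chunk >> 48) + (uint32_t)chunk)
;            + (uint32_t)(chunk >> 32);
;   }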