; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; Tests for the ARM parallel-DSP pass on multiply-accumulate chains whose
; candidate multiply pairs overlap, i.e. where one multiply or one loaded
; element could belong to more than one pair. Each function sign-extends
; i16 elements of %a and %b to i32, multiplies them, and adds the products
; together with %acc. The expectations in front of every function pin down
; which widened i32 loads and which llvm.arm.smlad / llvm.arm.smladx calls
; the pass has to emit.
;
; NOTE: the order of the instructions and of the add/mul operands is part of
; each test case. overlap_1 and overlap_2 differ only in operand order, yet
; they expect different pairings, so do not reorder or canonicalise the IR.

; overlap_1: three products a[i]*b[i] for i = 0, 1, 2. %mul.1 is used by both
; %add (with %mul.0) and %add.1 (with %mul.2). The expected output contains a
; single smlad fed by the i32 loads that start at element 1 of %a and %b,
; accumulating into %acc.
; CHECK-LABEL: overlap_1
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 1
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
define i32 @overlap_1(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.b.2 = load i16, i16* %addr.b.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.a.2, %sext.b.2
  ; %mul.1 feeds both of the following adds; this is the overlap under test.
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.1, %mul.2
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; overlap_2: the same three products as overlap_1, but %mul.2 has its
; operands commuted and %add.2 takes %add before %add.1. Here the expected
; smlad is fed by the i32 loads taken directly from %a and %b (elements 0
; and 1) rather than from element 1 onwards.
; CHECK-LABEL: overlap_2
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 %acc
define i32 @overlap_2(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.b.2 = load i16, i16* %addr.b.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  ; Operands are deliberately (b, a) here, unlike overlap_1.
  %mul.2 = mul i32 %sext.b.2, %sext.a.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.1, %mul.2
  ; Operand order is (%add, %add.1), the reverse of overlap_1.
  %add.2 = add i32 %add, %add.1
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; overlap_3: four products a[0]*b[0], a[1]*b[1], a[2]*b[1] and a[3]*b[2].
; Element b[1] is used by two different products, so the two widened loads
; of %b (from element 0 and from element 1) overlap in memory. Two chained
; smlad calls are expected: the first combines the i32 load of a[2..3] with
; the i32 load of b[1..2] and accumulates %acc; the second combines the i32
; loads of a[0..1] and b[0..1] and accumulates the first call's result.
; CHECK-LABEL: overlap_3
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smlad(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
define i32 @overlap_3(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.b.2 = load i16, i16* %addr.b.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; a[2]*b[1] and a[3]*b[2]: %sext.b.1 is reused from %mul.1 above.
  %mul.2 = mul i32 %sext.a.2, %sext.b.1
  %mul.3 = mul i32 %sext.a.3, %sext.b.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; overlap_4: like overlap_3, but the second pair of products is crossed:
; b[2]*a[2] and b[1]*a[3]. The first expected call is therefore the exchange
; variant, smladx, on the i32 loads of a[2..3] and b[1..2]; the second call
; is a plain smlad on the i32 loads of a[0..1] and b[0..1].
; CHECK-LABEL: overlap_4
; CHECK: [[GEP_B:%[^ ]+]] = getelementptr i16, i16* %b, i32 1
; CHECK: [[CAST_A:%[^ ]+]] = bitcast i16* %a to i32*
; CHECK: [[LD_A:%[^ ]+]] = load i32, i32* [[CAST_A]]
; CHECK: [[CAST_B:%[^ ]+]] = bitcast i16* %b to i32*
; CHECK: [[LD_B:%[^ ]+]] = load i32, i32* [[CAST_B]]
; CHECK: [[CAST_B_1:%[^ ]+]] = bitcast i16* [[GEP_B]] to i32*
; CHECK: [[LD_B_1:%[^ ]+]] = load i32, i32* [[CAST_B_1]]
; CHECK: [[GEP_A:%[^ ]+]] = getelementptr i16, i16* %a, i32 2
; CHECK: [[CAST_A_2:%[^ ]+]] = bitcast i16* [[GEP_A]] to i32*
; CHECK: [[LD_A_2:%[^ ]+]] = load i32, i32* [[CAST_A_2]]
; CHECK: [[SMLAD:%[^ ]+]] = call i32 @llvm.arm.smladx(i32 [[LD_A_2]], i32 [[LD_B_1]], i32 %acc)
; CHECK: call i32 @llvm.arm.smlad(i32 [[LD_A]], i32 [[LD_B]], i32 [[SMLAD]])
define i32 @overlap_4(i16* %a, i16* %b, i32 %acc) {
entry:
  %addr.a.1 = getelementptr i16, i16* %a, i32 1
  %addr.b.1 = getelementptr i16, i16* %b, i32 1
  %ld.a.0 = load i16, i16* %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, i16* %b
  %ld.a.1 = load i16, i16* %addr.a.1
  %ld.b.1 = load i16, i16* %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, i16* %a, i32 2
  %addr.b.2 = getelementptr i16, i16* %b, i32 2
  %addr.a.3 = getelementptr i16, i16* %a, i32 3
  %ld.a.2 = load i16, i16* %addr.a.2
  %ld.b.2 = load i16, i16* %addr.b.2
  %ld.a.3 = load i16, i16* %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  ; Crossed pairing: the lower a element meets the higher b element and
  ; vice versa, which is what the smladx expectation relies on.
  %mul.2 = mul i32 %sext.b.2, %sext.a.2
  %mul.3 = mul i32 %sext.b.1, %sext.a.3
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}