; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -arm-parallel-dsp -mtriple=armv7-a -S %s -o - | FileCheck %s

; Each function below builds a sum of sign-extended i16 products in which a
; product or a loaded value is used more than once, then adds %acc. The
; expected output keeps the original scalar instructions in place and returns
; the result of the inserted llvm.arm.smlad/smlald/smladx call instead.

; %mul.1 (a[1] * b[1]) feeds both %add and %add.1. The expected output chains
; two smlad calls: one on the widened loads at %a and %b, and one on the
; widened loads at %addr.a.1 and %addr.b.1.
define i32 @overlap_1(ptr %a, ptr %b, i32 %acc) {
; CHECK-LABEL: @overlap_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP16]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16
; CHECK-NEXT:    [[TMP21:%.*]] = sext i16 [[TMP20]] to i32
; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16
; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP16]], i32 [[TMP23]], i32 [[TMP10]])
; CHECK-NEXT:    [[TMP26:%.*]] = sext i16 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[TMP23]], 16
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16
; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]], align 2
; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP29]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
; CHECK-NEXT:    ret i32 [[TMP25]]
;
entry:
  %addr.a.1 = getelementptr i16, ptr %a, i32 1
  %addr.b.1 = getelementptr i16, ptr %b, i32 1
  %ld.a.0 = load i16, ptr %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, ptr %b
  %ld.a.1 = load i16, ptr %addr.a.1
  %ld.b.1 = load i16, ptr %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, ptr %a, i32 2
  %addr.b.2 = getelementptr i16, ptr %b, i32 2
  %ld.a.2 = load i16, ptr %addr.a.2
  %ld.b.2 = load i16, ptr %addr.b.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.a.2, %sext.b.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.1, %mul.2
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; Same multiply/add shape as overlap_1, but the two partial sums are
; sign-extended to i64 before being added to an i64 accumulator. The expected
; output chains two llvm.arm.smlald calls.
; TODO: Is it really best to generate smlald for the first instruction? Does
; this just increase register pressure unnecessarily?
define i64 @overlap_64_1(ptr %a, ptr %b, i64 %acc) {
; CHECK-LABEL: @overlap_64_1(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP1]], i32 [[TMP8]], i64 [[ACC:%.*]])
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP16]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16
; CHECK-NEXT:    [[TMP21:%.*]] = sext i16 [[TMP20]] to i32
; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16
; CHECK-NEXT:    [[TMP25:%.*]] = call i64 @llvm.arm.smlald(i32 [[TMP16]], i32 [[TMP23]], i64 [[TMP10]])
; CHECK-NEXT:    [[TMP26:%.*]] = sext i16 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[TMP23]], 16
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16
; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]], align 2
; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP29]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
; CHECK-NEXT:    [[SEXT_ADD:%.*]] = sext i32 [[ADD]] to i64
; CHECK-NEXT:    [[SEXT_ADD_1:%.*]] = sext i32 [[ADD_1]] to i64
; CHECK-NEXT:    [[ADD_2:%.*]] = add i64 [[SEXT_ADD_1]], [[SEXT_ADD]]
; CHECK-NEXT:    [[RES:%.*]] = add i64 [[ADD_2]], [[ACC]]
; CHECK-NEXT:    ret i64 [[TMP25]]
;
entry:
  %addr.a.1 = getelementptr i16, ptr %a, i32 1
  %addr.b.1 = getelementptr i16, ptr %b, i32 1
  %ld.a.0 = load i16, ptr %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, ptr %b
  %ld.a.1 = load i16, ptr %addr.a.1
  %ld.b.1 = load i16, ptr %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, ptr %a, i32 2
  %addr.b.2 = getelementptr i16, ptr %b, i32 2
  %ld.a.2 = load i16, ptr %addr.a.2
  %ld.b.2 = load i16, ptr %addr.b.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.a.2, %sext.b.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.1, %mul.2
  %sext.add = sext i32 %add to i64
  %sext.add.1 = sext i32 %add.1 to i64
  %add.2 = add i64 %sext.add.1, %sext.add
  %res = add i64 %add.2, %acc
  ret i64 %res
}

; Differs from overlap_1 in that %mul.2 has its operands swapped (b[2] * a[2])
; and %add.2 takes %add as its first operand. The expected output has a single
; smlad on the widened loads at %a and %b; %mul.1 and %mul.2 are added to the
; accumulator with scalar adds before that call.
define i32 @overlap_2(ptr %a, ptr %b, i32 %acc) {
; CHECK-LABEL: @overlap_2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
; CHECK-NEXT:    [[TMP10:%.*]] = sext i16 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP11:%.*]] = lshr i32 [[TMP8]], 16
; CHECK-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i16
; CHECK-NEXT:    [[TMP13:%.*]] = sext i16 [[TMP12]] to i32
; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP10]]
; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP13]]
; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[MUL_1]], [[ACC:%.*]]
; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]], align 2
; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[SEXT_B_2]], [[SEXT_A_2]]
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[MUL_2]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[TMP15]])
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_1]], [[MUL_2]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD]], [[ADD_1]]
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
; CHECK-NEXT:    ret i32 [[TMP16]]
;
entry:
  %addr.a.1 = getelementptr i16, ptr %a, i32 1
  %addr.b.1 = getelementptr i16, ptr %b, i32 1
  %ld.a.0 = load i16, ptr %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, ptr %b
  %ld.a.1 = load i16, ptr %addr.a.1
  %ld.b.1 = load i16, ptr %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, ptr %a, i32 2
  %addr.b.2 = getelementptr i16, ptr %b, i32 2
  %ld.a.2 = load i16, ptr %addr.a.2
  %ld.b.2 = load i16, ptr %addr.b.2
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %mul.2 = mul i32 %sext.b.2, %sext.a.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.1, %mul.2
  %add.2 = add i32 %add, %add.1
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; Four distinct products: a[0]*b[0], a[1]*b[1], a[2]*b[1] and a[3]*b[2], so
; the b[1] value is shared by %mul.1 and %mul.2. The expected output chains two
; smlad calls; the second one takes the widened load at %addr.a.2 and the
; widened load at %addr.b.1.
define i32 @overlap_3(ptr %a, ptr %b, i32 %acc) {
; CHECK-LABEL: @overlap_3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP16]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16
; CHECK-NEXT:    [[TMP21:%.*]] = sext i16 [[TMP20]] to i32
; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16
; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP23]], i32 [[TMP16]], i32 [[TMP10]])
; CHECK-NEXT:    [[TMP26:%.*]] = sext i16 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[TMP23]], 16
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16
; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]], align 2
; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP26]], [[TMP14]]
; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP29]], [[TMP21]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
; CHECK-NEXT:    ret i32 [[TMP25]]
;
entry:
  %addr.a.1 = getelementptr i16, ptr %a, i32 1
  %addr.b.1 = getelementptr i16, ptr %b, i32 1
  %ld.a.0 = load i16, ptr %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, ptr %b
  %ld.a.1 = load i16, ptr %addr.a.1
  %ld.b.1 = load i16, ptr %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, ptr %a, i32 2
  %addr.b.2 = getelementptr i16, ptr %b, i32 2
  %addr.a.3 = getelementptr i16, ptr %a, i32 3
  %ld.a.2 = load i16, ptr %addr.a.2
  %ld.b.2 = load i16, ptr %addr.b.2
  %ld.a.3 = load i16, ptr %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.a.2, %sext.b.1
  %mul.3 = mul i32 %sext.a.3, %sext.b.2
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}

; Like overlap_3, but the second pair of products is crossed: %mul.2 is
; b[2] * a[2] and %mul.3 is b[1] * a[3]. The expected output uses
; llvm.arm.smladx (rather than llvm.arm.smlad) for the second call, on the
; widened loads at %addr.a.2 and %addr.b.1.
define i32 @overlap_4(ptr %a, ptr %b, i32 %acc) {
; CHECK-LABEL: @overlap_4(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADDR_A_1:%.*]] = getelementptr i16, ptr [[A:%.*]], i32 1
; CHECK-NEXT:    [[ADDR_B_1:%.*]] = getelementptr i16, ptr [[B:%.*]], i32 1
; CHECK-NEXT:    [[LD_A_0:%.*]] = load i16, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[A]], align 2
; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; CHECK-NEXT:    [[TMP3:%.*]] = sext i16 [[TMP2]] to i32
; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[TMP1]], 16
; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; CHECK-NEXT:    [[TMP6:%.*]] = sext i16 [[TMP5]] to i32
; CHECK-NEXT:    [[SEXT_A_0:%.*]] = sext i16 [[LD_A_0]] to i32
; CHECK-NEXT:    [[LD_B_0:%.*]] = load i16, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[B]], align 2
; CHECK-NEXT:    [[TMP9:%.*]] = trunc i32 [[TMP8]] to i16
; CHECK-NEXT:    [[TMP10:%.*]] = call i32 @llvm.arm.smlad(i32 [[TMP1]], i32 [[TMP8]], i32 [[ACC:%.*]])
; CHECK-NEXT:    [[TMP11:%.*]] = sext i16 [[TMP9]] to i32
; CHECK-NEXT:    [[TMP12:%.*]] = lshr i32 [[TMP8]], 16
; CHECK-NEXT:    [[TMP13:%.*]] = trunc i32 [[TMP12]] to i16
; CHECK-NEXT:    [[TMP14:%.*]] = sext i16 [[TMP13]] to i32
; CHECK-NEXT:    [[LD_A_1:%.*]] = load i16, ptr [[ADDR_A_1]], align 2
; CHECK-NEXT:    [[LD_B_1:%.*]] = load i16, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADDR_B_1]], align 2
; CHECK-NEXT:    [[TMP17:%.*]] = trunc i32 [[TMP16]] to i16
; CHECK-NEXT:    [[TMP18:%.*]] = sext i16 [[TMP17]] to i32
; CHECK-NEXT:    [[TMP19:%.*]] = lshr i32 [[TMP16]], 16
; CHECK-NEXT:    [[TMP20:%.*]] = trunc i32 [[TMP19]] to i16
; CHECK-NEXT:    [[TMP21:%.*]] = sext i16 [[TMP20]] to i32
; CHECK-NEXT:    [[SEXT_A_1:%.*]] = sext i16 [[LD_A_1]] to i32
; CHECK-NEXT:    [[SEXT_B_1:%.*]] = sext i16 [[LD_B_1]] to i32
; CHECK-NEXT:    [[SEXT_B_0:%.*]] = sext i16 [[LD_B_0]] to i32
; CHECK-NEXT:    [[MUL_0:%.*]] = mul i32 [[TMP3]], [[TMP11]]
; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[TMP6]], [[TMP14]]
; CHECK-NEXT:    [[ADDR_A_2:%.*]] = getelementptr i16, ptr [[A]], i32 2
; CHECK-NEXT:    [[ADDR_B_2:%.*]] = getelementptr i16, ptr [[B]], i32 2
; CHECK-NEXT:    [[ADDR_A_3:%.*]] = getelementptr i16, ptr [[A]], i32 3
; CHECK-NEXT:    [[LD_A_2:%.*]] = load i16, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[ADDR_A_2]], align 2
; CHECK-NEXT:    [[TMP24:%.*]] = trunc i32 [[TMP23]] to i16
; CHECK-NEXT:    [[TMP25:%.*]] = call i32 @llvm.arm.smladx(i32 [[TMP23]], i32 [[TMP16]], i32 [[TMP10]])
; CHECK-NEXT:    [[TMP26:%.*]] = sext i16 [[TMP24]] to i32
; CHECK-NEXT:    [[TMP27:%.*]] = lshr i32 [[TMP23]], 16
; CHECK-NEXT:    [[TMP28:%.*]] = trunc i32 [[TMP27]] to i16
; CHECK-NEXT:    [[TMP29:%.*]] = sext i16 [[TMP28]] to i32
; CHECK-NEXT:    [[LD_B_2:%.*]] = load i16, ptr [[ADDR_B_2]], align 2
; CHECK-NEXT:    [[LD_A_3:%.*]] = load i16, ptr [[ADDR_A_3]], align 2
; CHECK-NEXT:    [[SEXT_A_2:%.*]] = sext i16 [[LD_A_2]] to i32
; CHECK-NEXT:    [[SEXT_B_2:%.*]] = sext i16 [[LD_B_2]] to i32
; CHECK-NEXT:    [[SEXT_A_3:%.*]] = sext i16 [[LD_A_3]] to i32
; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[TMP21]], [[TMP26]]
; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[TMP14]], [[TMP29]]
; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[MUL_0]], [[MUL_1]]
; CHECK-NEXT:    [[ADD_1:%.*]] = add i32 [[MUL_2]], [[MUL_3]]
; CHECK-NEXT:    [[ADD_2:%.*]] = add i32 [[ADD_1]], [[ADD]]
; CHECK-NEXT:    [[RES:%.*]] = add i32 [[ADD_2]], [[ACC]]
; CHECK-NEXT:    ret i32 [[TMP25]]
;
entry:
  %addr.a.1 = getelementptr i16, ptr %a, i32 1
  %addr.b.1 = getelementptr i16, ptr %b, i32 1
  %ld.a.0 = load i16, ptr %a
  %sext.a.0 = sext i16 %ld.a.0 to i32
  %ld.b.0 = load i16, ptr %b
  %ld.a.1 = load i16, ptr %addr.a.1
  %ld.b.1 = load i16, ptr %addr.b.1
  %sext.a.1 = sext i16 %ld.a.1 to i32
  %sext.b.1 = sext i16 %ld.b.1 to i32
  %sext.b.0 = sext i16 %ld.b.0 to i32
  %mul.0 = mul i32 %sext.a.0, %sext.b.0
  %mul.1 = mul i32 %sext.a.1, %sext.b.1
  %addr.a.2 = getelementptr i16, ptr %a, i32 2
  %addr.b.2 = getelementptr i16, ptr %b, i32 2
  %addr.a.3 = getelementptr i16, ptr %a, i32 3
  %ld.a.2 = load i16, ptr %addr.a.2
  %ld.b.2 = load i16, ptr %addr.b.2
  %ld.a.3 = load i16, ptr %addr.a.3
  %sext.a.2 = sext i16 %ld.a.2 to i32
  %sext.b.2 = sext i16 %ld.b.2 to i32
  %sext.a.3 = sext i16 %ld.a.3 to i32
  %mul.2 = mul i32 %sext.b.2, %sext.a.2
  %mul.3 = mul i32 %sext.b.1, %sext.a.3
  %add = add i32 %mul.0, %mul.1
  %add.1 = add i32 %mul.2, %mul.3
  %add.2 = add i32 %add.1, %add
  %res = add i32 %add.2, %acc
  ret i32 %res
}