; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s -o - | FileCheck %s

;; Codegen tests for @llvm.experimental.vector.histogram.add with SVE2 enabled
;; (see attribute group #0 at the end of the file). Unless the mask is a known
;; all-false constant, every test expects the same basic sequence:
;;   histcnt  - count matching buckets among the active lanes
;;   ld1*     - gather the current bucket values
;;   mad      - multiply the counts by the (splatted) increment and add the
;;              gathered values
;;   st1*     - scatter the updated values back to the buckets
;; The CHECK lines are autogenerated; regenerate them with
;; utils/update_llc_test_checks.py rather than editing them by hand.

;; Vector-of-pointers buckets, i64 elements, increment passed in a register.
define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
; CHECK-NEXT:    mov z3.d, x0
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z0.d]
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
; CHECK-NEXT:    st1d { z1.d }, p0, [z0.d]
; CHECK-NEXT:    ret
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
  ret void
}

;; FIXME: This probably needs a DAG combine. The increment is the literal 1, yet
;; the expected output still splats 1 (mov z3.s, #1) and feeds it to mad, i.e.
;; the histcnt result is multiplied by 1. The multiply is redundant: the histcnt
;; result could be added directly to the current bucket data.
define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i32_literal:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ret
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

;; As above, but the GEP is over i8 while the buckets are i32, so the expected
;; gather/scatter addressing uses unscaled offsets (sxtw with no shift).
define void @histogram_i32_literal_noscale(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i32_literal_noscale:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw]
; CHECK-NEXT:    ret
  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

;; i32 buckets with only two lanes per vscale: the expected code operates on
;; .d lanes and uses a 32-bit load/store (ld1w/st1w { z.d }).
define void @histogram_i32_promote(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i32 %inc) #0 {
; CHECK-LABEL: histogram_i32_promote:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    mov z3.d, x1
; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, z0.d, lsl #2]
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
; CHECK-NEXT:    st1w { z1.d }, p0, [x0, z0.d, lsl #2]
; CHECK-NEXT:    ret
  %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %indices
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 %inc, <vscale x 2 x i1> %mask)
  ret void
}

;; i16 buckets in four .s lanes: halfword gather/scatter (ld1h/st1h).
define void @histogram_i16(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0 {
; CHECK-LABEL: histogram_i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, w1
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
  ret void
}

;; i8 buckets in four .s lanes: byte gather/scatter (ld1b/st1b).
define void @histogram_i8(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0 {
; CHECK-LABEL: histogram_i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, w1
; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x0, z0.s, sxtw]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1b { z1.s }, p0, [x0, z0.s, sxtw]
; CHECK-NEXT:    ret
  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
  ret void
}

;; i16 buckets with two lanes per vscale and 64-bit indices (.d lanes).
define void @histogram_i16_2_lane(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i16 %inc) #0 {
; CHECK-LABEL: histogram_i16_2_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    mov z3.d, x1
; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, z0.d, lsl #1]
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
; CHECK-NEXT:    st1h { z1.d }, p0, [x0, z0.d, lsl #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 2 x i64> %indices
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i16(<vscale x 2 x ptr> %buckets, i16 %inc, <vscale x 2 x i1> %mask)
  ret void
}

;; i8 buckets with two lanes per vscale and 64-bit indices (.d lanes).
define void @histogram_i8_2_lane(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i8 %inc) #0 {
; CHECK-LABEL: histogram_i8_2_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
; CHECK-NEXT:    mov z3.d, x1
; CHECK-NEXT:    ld1b { z2.d }, p0/z, [x0, z0.d]
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
; CHECK-NEXT:    st1b { z1.d }, p0, [x0, z0.d]
; CHECK-NEXT:    ret
  %buckets = getelementptr i8, ptr %base, <vscale x 2 x i64> %indices
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i8(<vscale x 2 x ptr> %buckets, i8 %inc, <vscale x 2 x i1> %mask)
  ret void
}

;; i16 buckets with literal increments 1, 2 and 3. The only expected difference
;; between the three tests is the immediate splatted into z3.
define void @histogram_i16_literal_1(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i16_literal_1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_i16_literal_2(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i16_literal_2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #2 // =0x2
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 2, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_i16_literal_3(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i16_literal_3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #3 // =0x3
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 3, <vscale x 4 x i1> %mask)
  ret void
}

;; Wider-than-legal operand: <vscale x 4 x ptr> occupies two Z registers, so the
;; expected code unpacks the mask (punpklo/punpkhi) and emits the
;; histcnt/load/mad/store sequence once per half.
define void @histogram_i64_4_lane(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i64_4_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    mov z4.d, x0
; CHECK-NEXT:    ptrue p2.d
; CHECK-NEXT:    histcnt z2.d, p1/z, z0.d, z0.d
; CHECK-NEXT:    ld1d { z3.d }, p1/z, [z0.d]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z2.d, p2/m, z4.d, z3.d
; CHECK-NEXT:    st1d { z2.d }, p1, [z0.d]
; CHECK-NEXT:    histcnt z0.d, p0/z, z1.d, z1.d
; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z1.d]
; CHECK-NEXT:    mad z0.d, p2/m, z4.d, z2.d
; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
; CHECK-NEXT:    ret
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask)
  ret void
}

;; <vscale x 8 x ptr> occupies four Z registers: the mask is unpacked twice and
;; the sequence is expected four times.
define void @histogram_i64_8_lane(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i64_8_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    mov z6.d, x0
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    punpklo p3.h, p2.b
; CHECK-NEXT:    punpkhi p2.h, p2.b
; CHECK-NEXT:    histcnt z4.d, p3/z, z0.d, z0.d
; CHECK-NEXT:    ld1d { z5.d }, p3/z, [z0.d]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z4.d, p1/m, z6.d, z5.d
; CHECK-NEXT:    st1d { z4.d }, p3, [z0.d]
; CHECK-NEXT:    histcnt z0.d, p2/z, z1.d, z1.d
; CHECK-NEXT:    ld1d { z4.d }, p2/z, [z1.d]
; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z4.d
; CHECK-NEXT:    st1d { z0.d }, p2, [z1.d]
; CHECK-NEXT:    punpklo p2.h, p0.b
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    histcnt z0.d, p2/z, z2.d, z2.d
; CHECK-NEXT:    ld1d { z1.d }, p2/z, [z2.d]
; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p2, [z2.d]
; CHECK-NEXT:    histcnt z0.d, p0/z, z3.d, z3.d
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [z3.d]
; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [z3.d]
; CHECK-NEXT:    ret
  call void @llvm.experimental.vector.histogram.add.nxv8p0.i64(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask)
  ret void
}

;; Eight i32 indices per vscale (two Z registers of .s lanes): the sequence is
;; expected once per half, using base + 32-bit index addressing.
define void @histogram_i32_8_lane(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i32_8_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    mov z4.s, w1
; CHECK-NEXT:    ptrue p2.s
; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
; CHECK-NEXT:    ret
  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
  ret void
}

;; Same split as histogram_i32_8_lane, but with i16 buckets (ld1h/st1h).
define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %inc, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i16_8_lane:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    mov z4.s, w1
; CHECK-NEXT:    ptrue p2.s
; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
; CHECK-NEXT:    ld1h { z3.s }, p1/z, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
; CHECK-NEXT:    st1h { z2.s }, p1, [x0, z0.s, sxtw #1]
; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z1.s, sxtw #1]
; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
; CHECK-NEXT:    st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT:    ret
  %buckets = getelementptr i16, ptr %base, <vscale x 8 x i32> %indices
  call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 %inc, <vscale x 8 x i1> %mask)
  ret void
}

;; The following tests extend i32 indices to i64 in the IR before the GEP. The
;; expected code keeps 32-bit (.s) lanes and expresses the extension in the
;; addressing mode: uxtw for zext, sxtw for sext.
define void @histogram_i8_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0 {
; CHECK-LABEL: histogram_i8_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, w1
; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x0, z0.s, uxtw]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1b { z1.s }, p0, [x0, z0.s, uxtw]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_i16_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0 {
; CHECK-LABEL: histogram_i16_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, w1
; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, uxtw #1]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, uxtw #1]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_i32_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i32_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_i32_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_i32_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ret
  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

;; Indices narrower than i32 (i8/i16) extended to i64 or i32. The expected code
;; first extends within the .s lane (and-mask for zext, sxth for sext), then
;; uses the usual sequence.
define void @histogram_zext_from_i8_to_i64(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_zext_from_i8_to_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and z0.s, z0.s, #0xff
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_zext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_zext_from_i16_to_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and z0.s, z0.s, #0xffff
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_sext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_sext_from_i16_to_i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    sxth z0.s, p1/m, z0.s
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    ret
  %extended = sext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_zext_from_i8_to_i32(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_zext_from_i8_to_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and z0.s, z0.s, #0xff
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

define void @histogram_zext_from_i16_to_i32(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_zext_from_i16_to_i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and z0.s, z0.s, #0xffff
; CHECK-NEXT:    mov z3.s, #1 // =0x1
; CHECK-NEXT:    ptrue p1.s
; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i32>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
  ret void
}

;; Two i32 indices per vscale, zero-extended to i64. The expected code masks a
;; copy of the indices to 32 bits for histcnt, while the gather/scatter uses the
;; original register with uxtw addressing.
define void @histogram_2_lane_zext(ptr %base, <vscale x 2 x i32> %indices, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: histogram_2_lane_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z1.d, z0.d
; CHECK-NEXT:    mov z3.d, #1 // =0x1
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, z0.d, uxtw #2]
; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
; CHECK-NEXT:    histcnt z1.d, p0/z, z1.d, z1.d
; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
; CHECK-NEXT:    st1w { z1.d }, p0, [x0, z0.d, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 2 x i32> %indices to <vscale x 2 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 1, <vscale x 2 x i1> %mask)
  ret void
}

;; Eight i32 indices per vscale, extended to i64: the operand is split into two
;; halves, each using uxtw (zext) or sxtw (sext) addressing.
define void @histogram_8_lane_zext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: histogram_8_lane_zext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    mov z4.s, #1 // =0x1
; CHECK-NEXT:    ptrue p2.s
; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, uxtw #2]
; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, uxtw #2]
; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
; CHECK-NEXT:    ret
  %extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

define void @histogram_8_lane_sext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0 {
; CHECK-LABEL: histogram_8_lane_sext:
; CHECK:       // %bb.0:
; CHECK-NEXT:    punpklo p1.h, p0.b
; CHECK-NEXT:    mov z4.s, #1 // =0x1
; CHECK-NEXT:    ptrue p2.s
; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    punpkhi p0.h, p0.b
; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
; CHECK-NEXT:    ret
  %extended = sext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
  ret void
}

;; Constant all-false mask: no lane is active, so the whole histogram update is
;; expected to fold away and leave only the return. The %mask argument is
;; intentionally unused in these two tests.
define void @histogram_zero_mask(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
; CHECK-LABEL: histogram_zero_mask:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> zeroinitializer)
  ret void
}

define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
; CHECK-LABEL: histogram_sext_zero_mask:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ret
  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> zeroinitializer)
  ret void
}

attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }