; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s | FileCheck -check-prefix=ENABLED %s
; RUN: llc -disable-nvptx-load-store-vectorizer < %s | FileCheck -check-prefix=DISABLED %s
; RUN: %if ptxas %{ llc < %s | %ptxas-verify %}
; RUN: %if ptxas %{ llc -disable-nvptx-load-store-vectorizer < %s | %ptxas-verify %}

target triple = "nvptx64-nvidia-cuda"

; Check that the load-store vectorizer is enabled by default for nvptx, and
; that it's disabled by the appropriate flag.

define i32 @f(ptr %p) {
; ENABLED-LABEL: f(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b32 %r<4>;
; ENABLED-NEXT:    .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
; ENABLED-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT:    add.s32 %r3, %r1, %r2;
; ENABLED-NEXT:    st.param.b32 [func_retval0], %r3;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: f(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b32 %r<4>;
; DISABLED-NEXT:    .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [f_param_0];
; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
; DISABLED-NEXT:    add.s32 %r3, %r1, %r2;
; DISABLED-NEXT:    st.param.b32 [func_retval0], %r3;
; DISABLED-NEXT:    ret;
  %p.1 = getelementptr i32, ptr %p, i32 1
  %v0 = load i32, ptr %p, align 8
  %v1 = load i32, ptr %p.1, align 4
  %sum = add i32 %v0, %v1
  ret i32 %sum
}

define half @fh(ptr %p) {
; ENABLED-LABEL: fh(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b16 %rs<10>;
; ENABLED-NEXT:    .reg .f32 %f<13>;
; ENABLED-NEXT:    .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
; ENABLED-NEXT:    ld.v4.b16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
; ENABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
; ENABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
; ENABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
; ENABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
; ENABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
; ENABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
; ENABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
; ENABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
; ENABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
; ENABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
; ENABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
; ENABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
; ENABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
; ENABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
; ENABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
; ENABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
; ENABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: fh(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b16 %rs<10>;
; DISABLED-NEXT:    .reg .f32 %f<13>;
; DISABLED-NEXT:    .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [fh_param_0];
; DISABLED-NEXT:    ld.b16 %rs1, [%rd1];
; DISABLED-NEXT:    ld.b16 %rs2, [%rd1+2];
; DISABLED-NEXT:    ld.b16 %rs3, [%rd1+4];
; DISABLED-NEXT:    ld.b16 %rs4, [%rd1+6];
; DISABLED-NEXT:    ld.b16 %rs5, [%rd1+8];
; DISABLED-NEXT:    cvt.f32.f16 %f1, %rs2;
; DISABLED-NEXT:    cvt.f32.f16 %f2, %rs1;
; DISABLED-NEXT:    add.rn.f32 %f3, %f2, %f1;
; DISABLED-NEXT:    cvt.rn.f16.f32 %rs6, %f3;
; DISABLED-NEXT:    cvt.f32.f16 %f4, %rs4;
; DISABLED-NEXT:    cvt.f32.f16 %f5, %rs3;
; DISABLED-NEXT:    add.rn.f32 %f6, %f5, %f4;
; DISABLED-NEXT:    cvt.rn.f16.f32 %rs7, %f6;
; DISABLED-NEXT:    cvt.f32.f16 %f7, %rs7;
; DISABLED-NEXT:    cvt.f32.f16 %f8, %rs6;
; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f7;
; DISABLED-NEXT:    cvt.rn.f16.f32 %rs8, %f9;
; DISABLED-NEXT:    cvt.f32.f16 %f10, %rs8;
; DISABLED-NEXT:    cvt.f32.f16 %f11, %rs5;
; DISABLED-NEXT:    add.rn.f32 %f12, %f10, %f11;
; DISABLED-NEXT:    cvt.rn.f16.f32 %rs9, %f12;
; DISABLED-NEXT:    st.param.b16 [func_retval0], %rs9;
; DISABLED-NEXT:    ret;
  %p.1 = getelementptr half, ptr %p, i32 1
  %p.2 = getelementptr half, ptr %p, i32 2
  %p.3 = getelementptr half, ptr %p, i32 3
  %p.4 = getelementptr half, ptr %p, i32 4
  %v0 = load half, ptr %p, align 64
  %v1 = load half, ptr %p.1, align 4
  %v2 = load half, ptr %p.2, align 4
  %v3 = load half, ptr %p.3, align 4
  %v4 = load half, ptr %p.4, align 4
  %sum1 = fadd half %v0, %v1
  %sum2 = fadd half %v2, %v3
  %sum3 = fadd half %sum1, %sum2
  %sum = fadd half %sum3, %v4
  ret half %sum
}

define float @ff(ptr %p) {
; ENABLED-LABEL: ff(
; ENABLED:       {
; ENABLED-NEXT:    .reg .f32 %f<10>;
; ENABLED-NEXT:    .reg .b64 %rd<2>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
; ENABLED-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
; ENABLED-NEXT:    ld.f32 %f5, [%rd1+16];
; ENABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
; ENABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
; ENABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
; ENABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
; ENABLED-NEXT:    st.param.f32 [func_retval0], %f9;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: ff(
; DISABLED:       {
; DISABLED-NEXT:    .reg .f32 %f<10>;
; DISABLED-NEXT:    .reg .b64 %rd<2>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [ff_param_0];
; DISABLED-NEXT:    ld.f32 %f1, [%rd1];
; DISABLED-NEXT:    ld.f32 %f2, [%rd1+4];
; DISABLED-NEXT:    ld.f32 %f3, [%rd1+8];
; DISABLED-NEXT:    ld.f32 %f4, [%rd1+12];
; DISABLED-NEXT:    ld.f32 %f5, [%rd1+16];
; DISABLED-NEXT:    add.rn.f32 %f6, %f1, %f2;
; DISABLED-NEXT:    add.rn.f32 %f7, %f3, %f4;
; DISABLED-NEXT:    add.rn.f32 %f8, %f6, %f7;
; DISABLED-NEXT:    add.rn.f32 %f9, %f8, %f5;
; DISABLED-NEXT:    st.param.f32 [func_retval0], %f9;
; DISABLED-NEXT:    ret;
  %p.1 = getelementptr float, ptr %p, i32 1
  %p.2 = getelementptr float, ptr %p, i32 2
  %p.3 = getelementptr float, ptr %p, i32 3
  %p.4 = getelementptr float, ptr %p, i32 4
  %v0 = load float, ptr %p, align 64
  %v1 = load float, ptr %p.1, align 4
  %v2 = load float, ptr %p.2, align 4
  %v3 = load float, ptr %p.3, align 4
  %v4 = load float, ptr %p.4, align 4
  %sum1 = fadd float %v0, %v1
  %sum2 = fadd float %v2, %v3
  %sum3 = fadd float %sum1, %sum2
  %sum = fadd float %sum3, %v4
  ret float %sum
}

define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v16i8(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b32 %r<36>;
; ENABLED-NEXT:    .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
; ENABLED-NEXT:    st.u32 [%rd2], %r35;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: combine_v16i8(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b32 %r<32>;
; DISABLED-NEXT:    .reg .b64 %rd<3>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
; DISABLED-NEXT:    st.u32 [%rd2], %r31;
; DISABLED-NEXT:    ret;
  %val0 = load i8, ptr %ptr1, align 16
  %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
  %val1 = load i8, ptr %ptr1.1, align 1
  %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
  %val2 = load i8, ptr %ptr1.2, align 2
  %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
  %val3 = load i8, ptr %ptr1.3, align 1
  %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
  %val4 = load i8, ptr %ptr1.4, align 4
  %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
  %val5 = load i8, ptr %ptr1.5, align 1
  %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
  %val6 = load i8, ptr %ptr1.6, align 2
  %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
  %val7 = load i8, ptr %ptr1.7, align 1
  %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
  %val8 = load i8, ptr %ptr1.8, align 8
  %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
  %val9 = load i8, ptr %ptr1.9, align 1
  %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
  %val10 = load i8, ptr %ptr1.10, align 2
  %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
  %val11 = load i8, ptr %ptr1.11, align 1
  %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
  %val12 = load i8, ptr %ptr1.12, align 4
  %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
  %val13 = load i8, ptr %ptr1.13, align 1
  %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
  %val14 = load i8, ptr %ptr1.14, align 2
  %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
  %val15 = load i8, ptr %ptr1.15, align 1
  %lane0 = zext i8 %val0 to i32
  %lane1 = zext i8 %val1 to i32
  %lane2 = zext i8 %val2 to i32
  %lane3 = zext i8 %val3 to i32
  %lane4 = zext i8 %val4 to i32
  %lane5 = zext i8 %val5 to i32
  %lane6 = zext i8 %val6 to i32
  %lane7 = zext i8 %val7 to i32
  %lane8 = zext i8 %val8 to i32
  %lane9 = zext i8 %val9 to i32
  %lane10 = zext i8 %val10 to i32
  %lane11 = zext i8 %val11 to i32
  %lane12 = zext i8 %val12 to i32
  %lane13 = zext i8 %val13 to i32
  %lane14 = zext i8 %val14 to i32
  %lane15 = zext i8 %val15 to i32
  %red.1 = add i32 %lane0, %lane1
  %red.2 = add i32 %red.1, %lane2
  %red.3 = add i32 %red.2, %lane3
  %red.4 = add i32 %red.3, %lane4
  %red.5 = add i32 %red.4, %lane5
  %red.6 = add i32 %red.5, %lane6
  %red.7 = add i32 %red.6, %lane7
  %red.8 = add i32 %red.7, %lane8
  %red.9 = add i32 %red.8, %lane9
  %red.10 = add i32 %red.9, %lane10
  %red.11 = add i32 %red.10, %lane11
  %red.12 = add i32 %red.11, %lane12
  %red.13 = add i32 %red.12, %lane13
  %red.14 = add i32 %red.13, %lane14
  %red = add i32 %red.14, %lane15
  store i32 %red, ptr %ptr2, align 4
  ret void
}

define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v16i8_unaligned(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b32 %r<36>;
; ENABLED-NEXT:    .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
; ENABLED-NEXT:    ld.v2.b32 {%r3, %r4}, [%rd1+8];
; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
; ENABLED-NEXT:    add.s32 %r29, %r28, %r14;
; ENABLED-NEXT:    add.s32 %r30, %r29, %r15;
; ENABLED-NEXT:    add.s32 %r31, %r30, %r16;
; ENABLED-NEXT:    add.s32 %r32, %r31, %r17;
; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
; ENABLED-NEXT:    st.u32 [%rd2], %r35;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: combine_v16i8_unaligned(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b32 %r<32>;
; DISABLED-NEXT:    .reg .b64 %rd<3>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
; DISABLED-NEXT:    ld.u8 %r1, [%rd1];
; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
; DISABLED-NEXT:    ld.u8 %r2, [%rd1+1];
; DISABLED-NEXT:    ld.u8 %r3, [%rd1+2];
; DISABLED-NEXT:    ld.u8 %r4, [%rd1+3];
; DISABLED-NEXT:    ld.u8 %r5, [%rd1+4];
; DISABLED-NEXT:    ld.u8 %r6, [%rd1+5];
; DISABLED-NEXT:    ld.u8 %r7, [%rd1+6];
; DISABLED-NEXT:    ld.u8 %r8, [%rd1+7];
; DISABLED-NEXT:    ld.u8 %r9, [%rd1+8];
; DISABLED-NEXT:    ld.u8 %r10, [%rd1+9];
; DISABLED-NEXT:    ld.u8 %r11, [%rd1+10];
; DISABLED-NEXT:    ld.u8 %r12, [%rd1+11];
; DISABLED-NEXT:    ld.u8 %r13, [%rd1+12];
; DISABLED-NEXT:    ld.u8 %r14, [%rd1+13];
; DISABLED-NEXT:    ld.u8 %r15, [%rd1+14];
; DISABLED-NEXT:    ld.u8 %r16, [%rd1+15];
; DISABLED-NEXT:    add.s32 %r17, %r1, %r2;
; DISABLED-NEXT:    add.s32 %r18, %r17, %r3;
; DISABLED-NEXT:    add.s32 %r19, %r18, %r4;
; DISABLED-NEXT:    add.s32 %r20, %r19, %r5;
; DISABLED-NEXT:    add.s32 %r21, %r20, %r6;
; DISABLED-NEXT:    add.s32 %r22, %r21, %r7;
; DISABLED-NEXT:    add.s32 %r23, %r22, %r8;
; DISABLED-NEXT:    add.s32 %r24, %r23, %r9;
; DISABLED-NEXT:    add.s32 %r25, %r24, %r10;
; DISABLED-NEXT:    add.s32 %r26, %r25, %r11;
; DISABLED-NEXT:    add.s32 %r27, %r26, %r12;
; DISABLED-NEXT:    add.s32 %r28, %r27, %r13;
; DISABLED-NEXT:    add.s32 %r29, %r28, %r14;
; DISABLED-NEXT:    add.s32 %r30, %r29, %r15;
; DISABLED-NEXT:    add.s32 %r31, %r30, %r16;
; DISABLED-NEXT:    st.u32 [%rd2], %r31;
; DISABLED-NEXT:    ret;
  %val0 = load i8, ptr %ptr1, align 8
  %ptr1.1 = getelementptr inbounds i8, ptr %ptr1, i64 1
  %val1 = load i8, ptr %ptr1.1, align 1
  %ptr1.2 = getelementptr inbounds i8, ptr %ptr1, i64 2
  %val2 = load i8, ptr %ptr1.2, align 2
  %ptr1.3 = getelementptr inbounds i8, ptr %ptr1, i64 3
  %val3 = load i8, ptr %ptr1.3, align 1
  %ptr1.4 = getelementptr inbounds i8, ptr %ptr1, i64 4
  %val4 = load i8, ptr %ptr1.4, align 4
  %ptr1.5 = getelementptr inbounds i8, ptr %ptr1, i64 5
  %val5 = load i8, ptr %ptr1.5, align 1
  %ptr1.6 = getelementptr inbounds i8, ptr %ptr1, i64 6
  %val6 = load i8, ptr %ptr1.6, align 2
  %ptr1.7 = getelementptr inbounds i8, ptr %ptr1, i64 7
  %val7 = load i8, ptr %ptr1.7, align 1
  %ptr1.8 = getelementptr inbounds i8, ptr %ptr1, i64 8
  %val8 = load i8, ptr %ptr1.8, align 8
  %ptr1.9 = getelementptr inbounds i8, ptr %ptr1, i64 9
  %val9 = load i8, ptr %ptr1.9, align 1
  %ptr1.10 = getelementptr inbounds i8, ptr %ptr1, i64 10
  %val10 = load i8, ptr %ptr1.10, align 2
  %ptr1.11 = getelementptr inbounds i8, ptr %ptr1, i64 11
  %val11 = load i8, ptr %ptr1.11, align 1
  %ptr1.12 = getelementptr inbounds i8, ptr %ptr1, i64 12
  %val12 = load i8, ptr %ptr1.12, align 4
  %ptr1.13 = getelementptr inbounds i8, ptr %ptr1, i64 13
  %val13 = load i8, ptr %ptr1.13, align 1
  %ptr1.14 = getelementptr inbounds i8, ptr %ptr1, i64 14
  %val14 = load i8, ptr %ptr1.14, align 2
  %ptr1.15 = getelementptr inbounds i8, ptr %ptr1, i64 15
  %val15 = load i8, ptr %ptr1.15, align 1
  %lane0 = zext i8 %val0 to i32
  %lane1 = zext i8 %val1 to i32
  %lane2 = zext i8 %val2 to i32
  %lane3 = zext i8 %val3 to i32
  %lane4 = zext i8 %val4 to i32
  %lane5 = zext i8 %val5 to i32
  %lane6 = zext i8 %val6 to i32
  %lane7 = zext i8 %val7 to i32
  %lane8 = zext i8 %val8 to i32
  %lane9 = zext i8 %val9 to i32
  %lane10 = zext i8 %val10 to i32
  %lane11 = zext i8 %val11 to i32
  %lane12 = zext i8 %val12 to i32
  %lane13 = zext i8 %val13 to i32
  %lane14 = zext i8 %val14 to i32
  %lane15 = zext i8 %val15 to i32
  %red.1 = add i32 %lane0, %lane1
  %red.2 = add i32 %red.1, %lane2
  %red.3 = add i32 %red.2, %lane3
  %red.4 = add i32 %red.3, %lane4
  %red.5 = add i32 %red.4, %lane5
  %red.6 = add i32 %red.5, %lane6
  %red.7 = add i32 %red.6, %lane7
  %red.8 = add i32 %red.7, %lane8
  %red.9 = add i32 %red.8, %lane9
  %red.10 = add i32 %red.9, %lane10
  %red.11 = add i32 %red.10, %lane11
  %red.12 = add i32 %red.11, %lane12
  %red.13 = add i32 %red.12, %lane13
  %red.14 = add i32 %red.13, %lane14
  %red = add i32 %red.14, %lane15
  store i32 %red, ptr %ptr2, align 4
  ret void
}


define void @combine_v8i16(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v8i16(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b16 %rs<9>;
; ENABLED-NEXT:    .reg .b32 %r<20>;
; ENABLED-NEXT:    .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
; ENABLED-NEXT:    mov.b32 {%rs3, %rs4}, %r3;
; ENABLED-NEXT:    mov.b32 {%rs5, %rs6}, %r2;
; ENABLED-NEXT:    mov.b32 {%rs7, %rs8}, %r1;
; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
; ENABLED-NEXT:    cvt.u32.u16 %r5, %rs7;
; ENABLED-NEXT:    cvt.u32.u16 %r6, %rs8;
; ENABLED-NEXT:    cvt.u32.u16 %r7, %rs5;
; ENABLED-NEXT:    cvt.u32.u16 %r8, %rs6;
; ENABLED-NEXT:    cvt.u32.u16 %r9, %rs3;
; ENABLED-NEXT:    cvt.u32.u16 %r10, %rs4;
; ENABLED-NEXT:    cvt.u32.u16 %r11, %rs1;
; ENABLED-NEXT:    cvt.u32.u16 %r12, %rs2;
; ENABLED-NEXT:    add.s32 %r13, %r5, %r6;
; ENABLED-NEXT:    add.s32 %r14, %r13, %r7;
; ENABLED-NEXT:    add.s32 %r15, %r14, %r8;
; ENABLED-NEXT:    add.s32 %r16, %r15, %r9;
; ENABLED-NEXT:    add.s32 %r17, %r16, %r10;
; ENABLED-NEXT:    add.s32 %r18, %r17, %r11;
; ENABLED-NEXT:    add.s32 %r19, %r18, %r12;
; ENABLED-NEXT:    st.u32 [%rd2], %r19;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: combine_v8i16(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b32 %r<16>;
; DISABLED-NEXT:    .reg .b64 %rd<3>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v8i16_param_0];
; DISABLED-NEXT:    ld.u16 %r1, [%rd1];
; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v8i16_param_1];
; DISABLED-NEXT:    ld.u16 %r2, [%rd1+2];
; DISABLED-NEXT:    ld.u16 %r3, [%rd1+4];
; DISABLED-NEXT:    ld.u16 %r4, [%rd1+6];
; DISABLED-NEXT:    ld.u16 %r5, [%rd1+8];
; DISABLED-NEXT:    ld.u16 %r6, [%rd1+10];
; DISABLED-NEXT:    ld.u16 %r7, [%rd1+12];
; DISABLED-NEXT:    ld.u16 %r8, [%rd1+14];
; DISABLED-NEXT:    add.s32 %r9, %r1, %r2;
; DISABLED-NEXT:    add.s32 %r10, %r9, %r3;
; DISABLED-NEXT:    add.s32 %r11, %r10, %r4;
; DISABLED-NEXT:    add.s32 %r12, %r11, %r5;
; DISABLED-NEXT:    add.s32 %r13, %r12, %r6;
; DISABLED-NEXT:    add.s32 %r14, %r13, %r7;
; DISABLED-NEXT:    add.s32 %r15, %r14, %r8;
; DISABLED-NEXT:    st.u32 [%rd2], %r15;
; DISABLED-NEXT:    ret;
  %val0 = load i16, ptr %ptr1, align 16
  %ptr1.1 = getelementptr inbounds i16, ptr %ptr1, i64 1
  %val1 = load i16, ptr %ptr1.1, align 2
  %ptr1.2 = getelementptr inbounds i16, ptr %ptr1, i64 2
  %val2 = load i16, ptr %ptr1.2, align 4
  %ptr1.3 = getelementptr inbounds i16, ptr %ptr1, i64 3
  %val3 = load i16, ptr %ptr1.3, align 2
  %ptr1.4 = getelementptr inbounds i16, ptr %ptr1, i64 4
  %val4 = load i16, ptr %ptr1.4, align 4
  %ptr1.5 = getelementptr inbounds i16, ptr %ptr1, i64 5
  %val5 = load i16, ptr %ptr1.5, align 2
  %ptr1.6 = getelementptr inbounds i16, ptr %ptr1, i64 6
  %val6 = load i16, ptr %ptr1.6, align 4
  %ptr1.7 = getelementptr inbounds i16, ptr %ptr1, i64 7
  %val7 = load i16, ptr %ptr1.7, align 2
  %lane0 = zext i16 %val0 to i32
  %lane1 = zext i16 %val1 to i32
  %lane2 = zext i16 %val2 to i32
  %lane3 = zext i16 %val3 to i32
  %lane4 = zext i16 %val4 to i32
  %lane5 = zext i16 %val5 to i32
  %lane6 = zext i16 %val6 to i32
  %lane7 = zext i16 %val7 to i32
  %red.1 = add i32 %lane0, %lane1
  %red.2 = add i32 %red.1, %lane2
  %red.3 = add i32 %red.2, %lane3
  %red.4 = add i32 %red.3, %lane4
  %red.5 = add i32 %red.4, %lane5
  %red.6 = add i32 %red.5, %lane6
  %red = add i32 %red.6, %lane7
  store i32 %red, ptr %ptr2, align 4
  ret void
}

define void @combine_v4i32(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
; ENABLED-LABEL: combine_v4i32(
; ENABLED:       {
; ENABLED-NEXT:    .reg .b32 %r<8>;
; ENABLED-NEXT:    .reg .b64 %rd<3>;
; ENABLED-EMPTY:
; ENABLED-NEXT:  // %bb.0:
; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
; ENABLED-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
; ENABLED-NEXT:    add.s32 %r5, %r1, %r2;
; ENABLED-NEXT:    add.s32 %r6, %r5, %r3;
; ENABLED-NEXT:    add.s32 %r7, %r6, %r4;
; ENABLED-NEXT:    st.u32 [%rd2], %r7;
; ENABLED-NEXT:    ret;
;
; DISABLED-LABEL: combine_v4i32(
; DISABLED:       {
; DISABLED-NEXT:    .reg .b32 %r<8>;
; DISABLED-NEXT:    .reg .b64 %rd<3>;
; DISABLED-EMPTY:
; DISABLED-NEXT:  // %bb.0:
; DISABLED-NEXT:    ld.param.u64 %rd1, [combine_v4i32_param_0];
; DISABLED-NEXT:    ld.u32 %r1, [%rd1];
; DISABLED-NEXT:    ld.param.u64 %rd2, [combine_v4i32_param_1];
; DISABLED-NEXT:    ld.u32 %r2, [%rd1+4];
; DISABLED-NEXT:    ld.u32 %r3, [%rd1+8];
; DISABLED-NEXT:    ld.u32 %r4, [%rd1+12];
; DISABLED-NEXT:    add.s32 %r5, %r1, %r2;
; DISABLED-NEXT:    add.s32 %r6, %r5, %r3;
; DISABLED-NEXT:    add.s32 %r7, %r6, %r4;
; DISABLED-NEXT:    st.u32 [%rd2], %r7;
; DISABLED-NEXT:    ret;
  %val0 = load i32, ptr %ptr1, align 16
  %ptr1.1 = getelementptr inbounds i32, ptr %ptr1, i64 1
  %val1 = load i32, ptr %ptr1.1, align 4
  %ptr1.2 = getelementptr inbounds i32, ptr %ptr1, i64 2
  %val2 = load i32, ptr %ptr1.2, align 8
  %ptr1.3 = getelementptr inbounds i32, ptr %ptr1, i64 3
  %val3 = load i32, ptr %ptr1.3, align 4
  %red.1 = add i32 %val0, %val1
  %red.2 = add i32 %red.1, %val2
  %red = add i32 %red.2, %val3
  store i32 %red, ptr %ptr2, align 4
  ret void
}