1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 2; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s 3; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} 4 5; TODO: add i1, and <6 x i8> vector tests. 6 7; TODO: add test for vectors that exceed 128-bit length 8; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors 9; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed. 10 11; TODO: generate PTX that preserves Concurrent Forward Progress 12; for atomic operations to local statespace 13; by generating atomic or volatile operations. 14 15; TODO: design exposure for atomic operations on vector types. 16 17; TODO: add weak, atomic, volatile, and atomic volatile tests 18; for .const and .param statespaces. 19 20;; generic statespace 21 22; generic 23 24; TODO: make the lowering of these weak vector ops consistent with 25; the ones of the next tests. This test lowers to a weak PTX 26; vector op, but the next test lowers to a scalar PTX op. 27define void @generic_2xi8(ptr %a) { 28; CHECK-LABEL: generic_2xi8( 29; CHECK: { 30; CHECK-NEXT: .reg .b16 %rs<5>; 31; CHECK-NEXT: .reg .b64 %rd<2>; 32; CHECK-EMPTY: 33; CHECK-NEXT: // %bb.0: 34; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0]; 35; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1]; 36; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 37; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 38; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3}; 39; CHECK-NEXT: ret; 40 %a.load = load <2 x i8>, ptr %a 41 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 42 store <2 x i8> %a.add, ptr %a 43 ret void 44} 45 46; TODO: make the lowering of these weak vector ops consistent with 47; the ones of the previous test. This test lowers to a weak 48; PTX scalar op, but the prior test lowers to a vector PTX op. 
49define void @generic_4xi8(ptr %a) { 50; CHECK-LABEL: generic_4xi8( 51; CHECK: { 52; CHECK-NEXT: .reg .b16 %rs<9>; 53; CHECK-NEXT: .reg .b32 %r<13>; 54; CHECK-NEXT: .reg .b64 %rd<2>; 55; CHECK-EMPTY: 56; CHECK-NEXT: // %bb.0: 57; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; 58; CHECK-NEXT: ld.u32 %r1, [%rd1]; 59; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 60; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 61; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 62; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 63; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 64; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 65; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 66; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 67; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 68; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 69; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 70; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 71; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 72; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 73; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 74; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 75; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 76; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 77; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 78; CHECK-NEXT: st.u32 [%rd1], %r12; 79; CHECK-NEXT: ret; 80 %a.load = load <4 x i8>, ptr %a 81 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 82 store <4 x i8> %a.add, ptr %a 83 ret void 84} 85 86define void @generic_8xi8(ptr %a) { 87; CHECK-LABEL: generic_8xi8( 88; CHECK: { 89; CHECK-NEXT: .reg .b16 %rs<17>; 90; CHECK-NEXT: .reg .b32 %r<25>; 91; CHECK-NEXT: .reg .b64 %rd<2>; 92; CHECK-EMPTY: 93; CHECK-NEXT: // %bb.0: 94; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0]; 95; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; 96; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 97; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 98; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 99; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 100; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 101; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 102; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 103; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 104; CHECK-NEXT: prmt.b32 %r7, %r6, 
%r4, 0x3340U; 105; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 106; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 107; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 108; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 109; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 110; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 111; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 112; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 113; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 114; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 115; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 116; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 117; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 118; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 119; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 120; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 121; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 122; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 123; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 124; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 125; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 126; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 127; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 128; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 129; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 130; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 131; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 132; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 133; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 134; CHECK-NEXT: st.v2.b32 [%rd1], {%r24, %r13}; 135; CHECK-NEXT: ret; 136 %a.load = load <8 x i8>, ptr %a 137 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 138 store <8 x i8> %a.add, ptr %a 139 ret void 140} 141 142define void @generic_16xi8(ptr %a) { 143; CHECK-LABEL: generic_16xi8( 144; CHECK: { 145; CHECK-NEXT: .reg .b16 %rs<33>; 146; CHECK-NEXT: .reg .b32 %r<49>; 147; CHECK-NEXT: .reg .b64 %rd<2>; 148; CHECK-EMPTY: 149; CHECK-NEXT: // %bb.0: 150; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi8_param_0]; 151; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 152; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 153; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 154; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 
155; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 156; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 157; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 158; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 159; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 160; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 161; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 162; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 163; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 164; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 165; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 166; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 167; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 168; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 169; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 170; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 171; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 172; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 173; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 174; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 175; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 176; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 177; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 178; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 179; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 180; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 181; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 182; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 183; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 184; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 185; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 186; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 187; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 188; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 189; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 190; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 191; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 192; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 193; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 194; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 195; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 196; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 197; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 198; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 199; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 200; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 201; 
CHECK-NEXT: add.s16 %rs22, %rs21, 1; 202; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 203; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 204; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 205; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 206; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 207; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 208; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 209; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 210; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 211; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 212; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 213; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 214; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 215; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 216; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 217; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 218; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 219; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 220; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 221; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 222; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 223; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 224; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 225; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 226; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 227; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 228; CHECK-NEXT: st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 229; CHECK-NEXT: ret; 230 %a.load = load <16 x i8>, ptr %a 231 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 232 store <16 x i8> %a.add, ptr %a 233 ret void 234} 235 236define void @generic_2xi16(ptr %a) { 237; CHECK-LABEL: generic_2xi16( 238; CHECK: { 239; CHECK-NEXT: .reg .b16 %rs<5>; 240; CHECK-NEXT: .reg .b32 %r<3>; 241; CHECK-NEXT: .reg .b64 %rd<2>; 242; CHECK-EMPTY: 243; CHECK-NEXT: // %bb.0: 244; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; 245; CHECK-NEXT: ld.u32 %r1, [%rd1]; 246; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 247; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 248; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 249; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 
250; CHECK-NEXT: st.u32 [%rd1], %r2; 251; CHECK-NEXT: ret; 252 %a.load = load <2 x i16>, ptr %a 253 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 254 store <2 x i16> %a.add, ptr %a 255 ret void 256} 257 258define void @generic_4xi16(ptr %a) { 259; CHECK-LABEL: generic_4xi16( 260; CHECK: { 261; CHECK-NEXT: .reg .b16 %rs<9>; 262; CHECK-NEXT: .reg .b64 %rd<2>; 263; CHECK-EMPTY: 264; CHECK-NEXT: // %bb.0: 265; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; 266; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 267; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 268; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 269; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 270; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 271; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 272; CHECK-NEXT: ret; 273 %a.load = load <4 x i16>, ptr %a 274 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 275 store <4 x i16> %a.add, ptr %a 276 ret void 277} 278 279define void @generic_8xi16(ptr %a) { 280; CHECK-LABEL: generic_8xi16( 281; CHECK: { 282; CHECK-NEXT: .reg .b16 %rs<17>; 283; CHECK-NEXT: .reg .b32 %r<9>; 284; CHECK-NEXT: .reg .b64 %rd<2>; 285; CHECK-EMPTY: 286; CHECK-NEXT: // %bb.0: 287; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0]; 288; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 289; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 290; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 291; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 292; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 293; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 294; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 295; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 296; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 297; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 298; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 299; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 300; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 301; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 302; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 303; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 304; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 305; CHECK-NEXT: st.v4.b32 
[%rd1], {%r8, %r7, %r6, %r5}; 306; CHECK-NEXT: ret; 307 %a.load = load <8 x i16>, ptr %a 308 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 309 store <8 x i16> %a.add, ptr %a 310 ret void 311} 312 313define void @generic_2xi32(ptr %a) { 314; CHECK-LABEL: generic_2xi32( 315; CHECK: { 316; CHECK-NEXT: .reg .b32 %r<5>; 317; CHECK-NEXT: .reg .b64 %rd<2>; 318; CHECK-EMPTY: 319; CHECK-NEXT: // %bb.0: 320; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; 321; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; 322; CHECK-NEXT: add.s32 %r3, %r2, 1; 323; CHECK-NEXT: add.s32 %r4, %r1, 1; 324; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; 325; CHECK-NEXT: ret; 326 %a.load = load <2 x i32>, ptr %a 327 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 328 store <2 x i32> %a.add, ptr %a 329 ret void 330} 331 332define void @generic_4xi32(ptr %a) { 333; CHECK-LABEL: generic_4xi32( 334; CHECK: { 335; CHECK-NEXT: .reg .b32 %r<9>; 336; CHECK-NEXT: .reg .b64 %rd<2>; 337; CHECK-EMPTY: 338; CHECK-NEXT: // %bb.0: 339; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; 340; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 341; CHECK-NEXT: add.s32 %r5, %r4, 1; 342; CHECK-NEXT: add.s32 %r6, %r3, 1; 343; CHECK-NEXT: add.s32 %r7, %r2, 1; 344; CHECK-NEXT: add.s32 %r8, %r1, 1; 345; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 346; CHECK-NEXT: ret; 347 %a.load = load <4 x i32>, ptr %a 348 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 349 store <4 x i32> %a.add, ptr %a 350 ret void 351} 352 353define void @generic_2xi64(ptr %a) { 354; CHECK-LABEL: generic_2xi64( 355; CHECK: { 356; CHECK-NEXT: .reg .b64 %rd<6>; 357; CHECK-EMPTY: 358; CHECK-NEXT: // %bb.0: 359; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0]; 360; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; 361; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 362; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 363; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4}; 364; CHECK-NEXT: ret; 365 %a.load = load <2 x 
i64>, ptr %a 366 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 367 store <2 x i64> %a.add, ptr %a 368 ret void 369} 370 371define void @generic_2xfloat(ptr %a) { 372; CHECK-LABEL: generic_2xfloat( 373; CHECK: { 374; CHECK-NEXT: .reg .f32 %f<5>; 375; CHECK-NEXT: .reg .b64 %rd<2>; 376; CHECK-EMPTY: 377; CHECK-NEXT: // %bb.0: 378; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0]; 379; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1]; 380; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 381; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 382; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3}; 383; CHECK-NEXT: ret; 384 %a.load = load <2 x float>, ptr %a 385 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 386 store <2 x float> %a.add, ptr %a 387 ret void 388} 389 390define void @generic_4xfloat(ptr %a) { 391; CHECK-LABEL: generic_4xfloat( 392; CHECK: { 393; CHECK-NEXT: .reg .f32 %f<9>; 394; CHECK-NEXT: .reg .b64 %rd<2>; 395; CHECK-EMPTY: 396; CHECK-NEXT: // %bb.0: 397; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0]; 398; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 399; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 400; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 401; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 402; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 403; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 404; CHECK-NEXT: ret; 405 %a.load = load <4 x float>, ptr %a 406 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 407 store <4 x float> %a.add, ptr %a 408 ret void 409} 410 411define void @generic_2xdouble(ptr %a) { 412; CHECK-LABEL: generic_2xdouble( 413; CHECK: { 414; CHECK-NEXT: .reg .b64 %rd<2>; 415; CHECK-NEXT: .reg .f64 %fd<5>; 416; CHECK-EMPTY: 417; CHECK-NEXT: // %bb.0: 418; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0]; 419; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; 420; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 421; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 422; 
CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3}; 423; CHECK-NEXT: ret; 424 %a.load = load <2 x double>, ptr %a 425 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 426 store <2 x double> %a.add, ptr %a 427 ret void 428} 429 430; generic_volatile 431 432; TODO: volatile, atomic, and volatile atomic memory operations on vector types. 433; Currently, LLVM: 434; - does not allow atomic operations on vectors. 435; - allows volatile operations, but it is not clear what that means. 436; Both of the following semantics make sense in general, and PTX supports both: 437; - volatile/atomic/volatile atomic applies to the whole vector 438; - volatile/atomic/volatile atomic applies elementwise 439; Actions required: 440; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those 441; The tests below show that the current implementation picks the semantics in an inconsistent way 442; * volatile <2 x i8> lowers to "elementwise volatile" 443; * volatile <4 x i8> lowers to "full vector volatile" 444; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics 445; - update tests in load-store-sm70.ll as well. 446 447; TODO: make this operation consistent with the one for <4 x i8> 448; This operation lowers to an "elementwise volatile PTX operation". 
449define void @generic_volatile_2xi8(ptr %a) { 450; CHECK-LABEL: generic_volatile_2xi8( 451; CHECK: { 452; CHECK-NEXT: .reg .b16 %rs<5>; 453; CHECK-NEXT: .reg .b64 %rd<2>; 454; CHECK-EMPTY: 455; CHECK-NEXT: // %bb.0: 456; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi8_param_0]; 457; CHECK-NEXT: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1]; 458; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 459; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 460; CHECK-NEXT: st.volatile.v2.u8 [%rd1], {%rs4, %rs3}; 461; CHECK-NEXT: ret; 462 %a.load = load volatile <2 x i8>, ptr %a 463 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 464 store volatile <2 x i8> %a.add, ptr %a 465 ret void 466} 467 468; TODO: make this operation consistent with the one for <2 x i8> 469; This operation lowers to a "full vector volatile PTX operation". 470define void @generic_volatile_4xi8(ptr %a) { 471; CHECK-LABEL: generic_volatile_4xi8( 472; CHECK: { 473; CHECK-NEXT: .reg .b16 %rs<9>; 474; CHECK-NEXT: .reg .b32 %r<13>; 475; CHECK-NEXT: .reg .b64 %rd<2>; 476; CHECK-EMPTY: 477; CHECK-NEXT: // %bb.0: 478; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi8_param_0]; 479; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; 480; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 481; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 482; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 483; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 484; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 485; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 486; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 487; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 488; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 489; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 490; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 491; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 492; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 493; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 494; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 495; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 496; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 497; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 498; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 499; CHECK-NEXT: 
st.volatile.u32 [%rd1], %r12; 500; CHECK-NEXT: ret; 501 %a.load = load volatile <4 x i8>, ptr %a 502 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 503 store volatile <4 x i8> %a.add, ptr %a 504 ret void 505} 506 507define void @generic_volatile_8xi8(ptr %a) { 508; CHECK-LABEL: generic_volatile_8xi8( 509; CHECK: { 510; CHECK-NEXT: .reg .b16 %rs<17>; 511; CHECK-NEXT: .reg .b32 %r<25>; 512; CHECK-NEXT: .reg .b64 %rd<2>; 513; CHECK-EMPTY: 514; CHECK-NEXT: // %bb.0: 515; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi8_param_0]; 516; CHECK-NEXT: ld.volatile.v2.b32 {%r1, %r2}, [%rd1]; 517; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 518; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 519; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 520; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 521; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 522; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 523; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 524; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 525; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 526; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 527; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 528; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 529; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 530; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 531; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 532; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 533; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 534; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 535; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 536; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 537; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 538; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 539; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 540; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 541; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 542; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 543; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 544; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 545; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 546; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 547; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 548; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 549; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 
8; 550; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 551; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 552; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 553; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 554; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 555; CHECK-NEXT: st.volatile.v2.b32 [%rd1], {%r24, %r13}; 556; CHECK-NEXT: ret; 557 %a.load = load volatile <8 x i8>, ptr %a 558 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 559 store volatile <8 x i8> %a.add, ptr %a 560 ret void 561} 562 563define void @generic_volatile_16xi8(ptr %a) { 564; CHECK-LABEL: generic_volatile_16xi8( 565; CHECK: { 566; CHECK-NEXT: .reg .b16 %rs<33>; 567; CHECK-NEXT: .reg .b32 %r<49>; 568; CHECK-NEXT: .reg .b64 %rd<2>; 569; CHECK-EMPTY: 570; CHECK-NEXT: // %bb.0: 571; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi8_param_0]; 572; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 573; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 574; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 575; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 576; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 577; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 578; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 579; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 580; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 581; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 582; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 583; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 584; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 585; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 586; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 587; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 588; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 589; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 590; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 591; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 592; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 593; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 594; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 595; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 596; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 597; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 598; CHECK-NEXT: add.s16 %rs12, %rs11, 
1; 599; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 600; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 601; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 602; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 603; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 604; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 605; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 606; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 607; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 608; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 609; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 610; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 611; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 612; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 613; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 614; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 615; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 616; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 617; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 618; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 619; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 620; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 621; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 622; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 623; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 624; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 625; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 626; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 627; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 628; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 629; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 630; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 631; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 632; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 633; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 634; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 635; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 636; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 637; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 638; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 639; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 640; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 641; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 642; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 643; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 644; CHECK-NEXT: 
cvt.u16.u32 %rs31, %r45; 645; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 646; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 647; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 648; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 649; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 650; CHECK-NEXT: ret; 651 %a.load = load volatile <16 x i8>, ptr %a 652 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 653 store volatile <16 x i8> %a.add, ptr %a 654 ret void 655} 656 657define void @generic_volatile_2xi16(ptr %a) { 658; CHECK-LABEL: generic_volatile_2xi16( 659; CHECK: { 660; CHECK-NEXT: .reg .b16 %rs<5>; 661; CHECK-NEXT: .reg .b32 %r<3>; 662; CHECK-NEXT: .reg .b64 %rd<2>; 663; CHECK-EMPTY: 664; CHECK-NEXT: // %bb.0: 665; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi16_param_0]; 666; CHECK-NEXT: ld.volatile.u32 %r1, [%rd1]; 667; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 668; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 669; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 670; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 671; CHECK-NEXT: st.volatile.u32 [%rd1], %r2; 672; CHECK-NEXT: ret; 673 %a.load = load volatile <2 x i16>, ptr %a 674 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 675 store volatile <2 x i16> %a.add, ptr %a 676 ret void 677} 678 679define void @generic_volatile_4xi16(ptr %a) { 680; CHECK-LABEL: generic_volatile_4xi16( 681; CHECK: { 682; CHECK-NEXT: .reg .b16 %rs<9>; 683; CHECK-NEXT: .reg .b64 %rd<2>; 684; CHECK-EMPTY: 685; CHECK-NEXT: // %bb.0: 686; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi16_param_0]; 687; CHECK-NEXT: ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 688; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 689; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 690; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 691; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 692; CHECK-NEXT: st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 693; CHECK-NEXT: ret; 694 %a.load = load volatile <4 x i16>, ptr %a 695 %a.add = add <4 
x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 696 store volatile <4 x i16> %a.add, ptr %a 697 ret void 698} 699 700define void @generic_volatile_8xi16(ptr %a) { 701; CHECK-LABEL: generic_volatile_8xi16( 702; CHECK: { 703; CHECK-NEXT: .reg .b16 %rs<17>; 704; CHECK-NEXT: .reg .b32 %r<9>; 705; CHECK-NEXT: .reg .b64 %rd<2>; 706; CHECK-EMPTY: 707; CHECK-NEXT: // %bb.0: 708; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi16_param_0]; 709; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 710; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 711; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 712; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 713; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 714; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 715; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 716; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 717; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 718; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 719; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 720; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 721; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 722; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 723; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 724; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 725; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 726; CHECK-NEXT: st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 727; CHECK-NEXT: ret; 728 %a.load = load volatile <8 x i16>, ptr %a 729 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 730 store volatile <8 x i16> %a.add, ptr %a 731 ret void 732} 733 734define void @generic_volatile_2xi32(ptr %a) { 735; CHECK-LABEL: generic_volatile_2xi32( 736; CHECK: { 737; CHECK-NEXT: .reg .b32 %r<5>; 738; CHECK-NEXT: .reg .b64 %rd<2>; 739; CHECK-EMPTY: 740; CHECK-NEXT: // %bb.0: 741; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi32_param_0]; 742; CHECK-NEXT: ld.volatile.v2.u32 {%r1, %r2}, [%rd1]; 743; CHECK-NEXT: add.s32 %r3, %r2, 1; 744; CHECK-NEXT: add.s32 %r4, %r1, 1; 745; CHECK-NEXT: st.volatile.v2.u32 [%rd1], {%r4, %r3}; 746; CHECK-NEXT: ret; 747 %a.load = load 
volatile <2 x i32>, ptr %a 748 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 749 store volatile <2 x i32> %a.add, ptr %a 750 ret void 751} 752 753define void @generic_volatile_4xi32(ptr %a) { 754; CHECK-LABEL: generic_volatile_4xi32( 755; CHECK: { 756; CHECK-NEXT: .reg .b32 %r<9>; 757; CHECK-NEXT: .reg .b64 %rd<2>; 758; CHECK-EMPTY: 759; CHECK-NEXT: // %bb.0: 760; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi32_param_0]; 761; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 762; CHECK-NEXT: add.s32 %r5, %r4, 1; 763; CHECK-NEXT: add.s32 %r6, %r3, 1; 764; CHECK-NEXT: add.s32 %r7, %r2, 1; 765; CHECK-NEXT: add.s32 %r8, %r1, 1; 766; CHECK-NEXT: st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 767; CHECK-NEXT: ret; 768 %a.load = load volatile <4 x i32>, ptr %a 769 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 770 store volatile <4 x i32> %a.add, ptr %a 771 ret void 772} 773 774define void @generic_volatile_2xi64(ptr %a) { 775; CHECK-LABEL: generic_volatile_2xi64( 776; CHECK: { 777; CHECK-NEXT: .reg .b64 %rd<6>; 778; CHECK-EMPTY: 779; CHECK-NEXT: // %bb.0: 780; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xi64_param_0]; 781; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; 782; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 783; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 784; CHECK-NEXT: st.volatile.v2.u64 [%rd1], {%rd5, %rd4}; 785; CHECK-NEXT: ret; 786 %a.load = load volatile <2 x i64>, ptr %a 787 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 788 store volatile <2 x i64> %a.add, ptr %a 789 ret void 790} 791 792define void @generic_volatile_2xfloat(ptr %a) { 793; CHECK-LABEL: generic_volatile_2xfloat( 794; CHECK: { 795; CHECK-NEXT: .reg .f32 %f<5>; 796; CHECK-NEXT: .reg .b64 %rd<2>; 797; CHECK-EMPTY: 798; CHECK-NEXT: // %bb.0: 799; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0]; 800; CHECK-NEXT: ld.volatile.v2.f32 {%f1, %f2}, [%rd1]; 801; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 802; CHECK-NEXT: add.rn.f32 %f4, %f1, 
0f3F800000; 803; CHECK-NEXT: st.volatile.v2.f32 [%rd1], {%f4, %f3}; 804; CHECK-NEXT: ret; 805 %a.load = load volatile <2 x float>, ptr %a 806 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 807 store volatile <2 x float> %a.add, ptr %a 808 ret void 809} 810 811define void @generic_volatile_4xfloat(ptr %a) { 812; CHECK-LABEL: generic_volatile_4xfloat( 813; CHECK: { 814; CHECK-NEXT: .reg .f32 %f<9>; 815; CHECK-NEXT: .reg .b64 %rd<2>; 816; CHECK-EMPTY: 817; CHECK-NEXT: // %bb.0: 818; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0]; 819; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 820; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 821; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 822; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 823; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 824; CHECK-NEXT: st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 825; CHECK-NEXT: ret; 826 %a.load = load volatile <4 x float>, ptr %a 827 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 828 store volatile <4 x float> %a.add, ptr %a 829 ret void 830} 831 832define void @generic_volatile_2xdouble(ptr %a) { 833; CHECK-LABEL: generic_volatile_2xdouble( 834; CHECK: { 835; CHECK-NEXT: .reg .b64 %rd<2>; 836; CHECK-NEXT: .reg .f64 %fd<5>; 837; CHECK-EMPTY: 838; CHECK-NEXT: // %bb.0: 839; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0]; 840; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; 841; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 842; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 843; CHECK-NEXT: st.volatile.v2.f64 [%rd1], {%fd4, %fd3}; 844; CHECK-NEXT: ret; 845 %a.load = load volatile <2 x double>, ptr %a 846 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 847 store volatile <2 x double> %a.add, ptr %a 848 ret void 849} 850 851;; global statespace 852 853; global 854 855define void @global_2xi8(ptr addrspace(1) %a) { 856; CHECK-LABEL: global_2xi8( 857; CHECK: { 858; 
CHECK-NEXT: .reg .b16 %rs<5>; 859; CHECK-NEXT: .reg .b64 %rd<2>; 860; CHECK-EMPTY: 861; CHECK-NEXT: // %bb.0: 862; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi8_param_0]; 863; CHECK-NEXT: ld.global.v2.u8 {%rs1, %rs2}, [%rd1]; 864; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 865; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 866; CHECK-NEXT: st.global.v2.u8 [%rd1], {%rs4, %rs3}; 867; CHECK-NEXT: ret; 868 %a.load = load <2 x i8>, ptr addrspace(1) %a 869 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 870 store <2 x i8> %a.add, ptr addrspace(1) %a 871 ret void 872} 873 874define void @global_4xi8(ptr addrspace(1) %a) { 875; CHECK-LABEL: global_4xi8( 876; CHECK: { 877; CHECK-NEXT: .reg .b16 %rs<9>; 878; CHECK-NEXT: .reg .b32 %r<13>; 879; CHECK-NEXT: .reg .b64 %rd<2>; 880; CHECK-EMPTY: 881; CHECK-NEXT: // %bb.0: 882; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi8_param_0]; 883; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; 884; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 885; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 886; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 887; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 888; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 889; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 890; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 891; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 892; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 893; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 894; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 895; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 896; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 897; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 898; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 899; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 900; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 901; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 902; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 903; CHECK-NEXT: st.global.u32 [%rd1], %r12; 904; CHECK-NEXT: ret; 905 %a.load = load <4 x i8>, ptr addrspace(1) %a 906 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 907 store <4 x i8> %a.add, ptr addrspace(1) %a 908 ret void 909} 910 911define void @global_8xi8(ptr addrspace(1) %a) { 912; 
CHECK-LABEL: global_8xi8( 913; CHECK: { 914; CHECK-NEXT: .reg .b16 %rs<17>; 915; CHECK-NEXT: .reg .b32 %r<25>; 916; CHECK-NEXT: .reg .b64 %rd<2>; 917; CHECK-EMPTY: 918; CHECK-NEXT: // %bb.0: 919; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi8_param_0]; 920; CHECK-NEXT: ld.global.v2.b32 {%r1, %r2}, [%rd1]; 921; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 922; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 923; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 924; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 925; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 926; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 927; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 928; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 929; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 930; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 931; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 932; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 933; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 934; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 935; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 936; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 937; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 938; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 939; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 940; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 941; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 942; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 943; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 944; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 945; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 946; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 947; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 948; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 949; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 950; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 951; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 952; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 953; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 954; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 955; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 956; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 957; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 958; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 959; CHECK-NEXT: st.global.v2.b32 [%rd1], {%r24, %r13}; 960; 
CHECK-NEXT: ret; 961 %a.load = load <8 x i8>, ptr addrspace(1) %a 962 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 963 store <8 x i8> %a.add, ptr addrspace(1) %a 964 ret void 965} 966 967define void @global_16xi8(ptr addrspace(1) %a) { 968; CHECK-LABEL: global_16xi8( 969; CHECK: { 970; CHECK-NEXT: .reg .b16 %rs<33>; 971; CHECK-NEXT: .reg .b32 %r<49>; 972; CHECK-NEXT: .reg .b64 %rd<2>; 973; CHECK-EMPTY: 974; CHECK-NEXT: // %bb.0: 975; CHECK-NEXT: ld.param.u64 %rd1, [global_16xi8_param_0]; 976; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 977; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 978; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 979; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 980; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 981; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 982; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 983; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 984; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 985; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 986; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 987; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 988; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 989; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 990; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 991; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 992; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 993; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 994; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 995; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 996; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 997; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 998; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 999; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 1000; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 1001; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 1002; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 1003; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 1004; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 1005; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 1006; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 1007; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 1008; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 1009; CHECK-NEXT: bfe.u32 %r23, 
%r3, 0, 8; 1010; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 1011; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 1012; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 1013; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 1014; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 1015; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 1016; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 1017; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 1018; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 1019; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 1020; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 1021; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 1022; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 1023; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 1024; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 1025; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 1026; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 1027; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 1028; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 1029; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 1030; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 1031; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 1032; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 1033; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 1034; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 1035; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 1036; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 1037; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 1038; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 1039; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 1040; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 1041; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 1042; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 1043; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 1044; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 1045; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 1046; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 1047; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 1048; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 1049; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 1050; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 1051; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 1052; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 1053; CHECK-NEXT: st.global.v4.b32 [%rd1], 
{%r48, %r37, %r26, %r15}; 1054; CHECK-NEXT: ret; 1055 %a.load = load <16 x i8>, ptr addrspace(1) %a 1056 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1057 store <16 x i8> %a.add, ptr addrspace(1) %a 1058 ret void 1059} 1060 1061define void @global_2xi16(ptr addrspace(1) %a) { 1062; CHECK-LABEL: global_2xi16( 1063; CHECK: { 1064; CHECK-NEXT: .reg .b16 %rs<5>; 1065; CHECK-NEXT: .reg .b32 %r<3>; 1066; CHECK-NEXT: .reg .b64 %rd<2>; 1067; CHECK-EMPTY: 1068; CHECK-NEXT: // %bb.0: 1069; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi16_param_0]; 1070; CHECK-NEXT: ld.global.u32 %r1, [%rd1]; 1071; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 1072; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1073; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1074; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 1075; CHECK-NEXT: st.global.u32 [%rd1], %r2; 1076; CHECK-NEXT: ret; 1077 %a.load = load <2 x i16>, ptr addrspace(1) %a 1078 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 1079 store <2 x i16> %a.add, ptr addrspace(1) %a 1080 ret void 1081} 1082 1083define void @global_4xi16(ptr addrspace(1) %a) { 1084; CHECK-LABEL: global_4xi16( 1085; CHECK: { 1086; CHECK-NEXT: .reg .b16 %rs<9>; 1087; CHECK-NEXT: .reg .b64 %rd<2>; 1088; CHECK-EMPTY: 1089; CHECK-NEXT: // %bb.0: 1090; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi16_param_0]; 1091; CHECK-NEXT: ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 1092; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 1093; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 1094; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 1095; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 1096; CHECK-NEXT: st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 1097; CHECK-NEXT: ret; 1098 %a.load = load <4 x i16>, ptr addrspace(1) %a 1099 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 1100 store <4 x i16> %a.add, ptr addrspace(1) %a 1101 ret void 1102} 1103 1104define void @global_8xi16(ptr addrspace(1) %a) { 1105; CHECK-LABEL: global_8xi16( 1106; CHECK: { 1107; 
CHECK-NEXT: .reg .b16 %rs<17>; 1108; CHECK-NEXT: .reg .b32 %r<9>; 1109; CHECK-NEXT: .reg .b64 %rd<2>; 1110; CHECK-EMPTY: 1111; CHECK-NEXT: // %bb.0: 1112; CHECK-NEXT: ld.param.u64 %rd1, [global_8xi16_param_0]; 1113; CHECK-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 1114; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 1115; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1116; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1117; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 1118; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 1119; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 1120; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 1121; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 1122; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 1123; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 1124; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 1125; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 1126; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 1127; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 1128; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 1129; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 1130; CHECK-NEXT: st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 1131; CHECK-NEXT: ret; 1132 %a.load = load <8 x i16>, ptr addrspace(1) %a 1133 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1134 store <8 x i16> %a.add, ptr addrspace(1) %a 1135 ret void 1136} 1137 1138define void @global_2xi32(ptr addrspace(1) %a) { 1139; CHECK-LABEL: global_2xi32( 1140; CHECK: { 1141; CHECK-NEXT: .reg .b32 %r<5>; 1142; CHECK-NEXT: .reg .b64 %rd<2>; 1143; CHECK-EMPTY: 1144; CHECK-NEXT: // %bb.0: 1145; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi32_param_0]; 1146; CHECK-NEXT: ld.global.v2.u32 {%r1, %r2}, [%rd1]; 1147; CHECK-NEXT: add.s32 %r3, %r2, 1; 1148; CHECK-NEXT: add.s32 %r4, %r1, 1; 1149; CHECK-NEXT: st.global.v2.u32 [%rd1], {%r4, %r3}; 1150; CHECK-NEXT: ret; 1151 %a.load = load <2 x i32>, ptr addrspace(1) %a 1152 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 1153 store <2 x i32> %a.add, ptr addrspace(1) %a 1154 ret void 1155} 1156 1157define void @global_4xi32(ptr addrspace(1) 
%a) { 1158; CHECK-LABEL: global_4xi32( 1159; CHECK: { 1160; CHECK-NEXT: .reg .b32 %r<9>; 1161; CHECK-NEXT: .reg .b64 %rd<2>; 1162; CHECK-EMPTY: 1163; CHECK-NEXT: // %bb.0: 1164; CHECK-NEXT: ld.param.u64 %rd1, [global_4xi32_param_0]; 1165; CHECK-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 1166; CHECK-NEXT: add.s32 %r5, %r4, 1; 1167; CHECK-NEXT: add.s32 %r6, %r3, 1; 1168; CHECK-NEXT: add.s32 %r7, %r2, 1; 1169; CHECK-NEXT: add.s32 %r8, %r1, 1; 1170; CHECK-NEXT: st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 1171; CHECK-NEXT: ret; 1172 %a.load = load <4 x i32>, ptr addrspace(1) %a 1173 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 1174 store <4 x i32> %a.add, ptr addrspace(1) %a 1175 ret void 1176} 1177 1178define void @global_2xi64(ptr addrspace(1) %a) { 1179; CHECK-LABEL: global_2xi64( 1180; CHECK: { 1181; CHECK-NEXT: .reg .b64 %rd<6>; 1182; CHECK-EMPTY: 1183; CHECK-NEXT: // %bb.0: 1184; CHECK-NEXT: ld.param.u64 %rd1, [global_2xi64_param_0]; 1185; CHECK-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; 1186; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 1187; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 1188; CHECK-NEXT: st.global.v2.u64 [%rd1], {%rd5, %rd4}; 1189; CHECK-NEXT: ret; 1190 %a.load = load <2 x i64>, ptr addrspace(1) %a 1191 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 1192 store <2 x i64> %a.add, ptr addrspace(1) %a 1193 ret void 1194} 1195 1196define void @global_2xfloat(ptr addrspace(1) %a) { 1197; CHECK-LABEL: global_2xfloat( 1198; CHECK: { 1199; CHECK-NEXT: .reg .f32 %f<5>; 1200; CHECK-NEXT: .reg .b64 %rd<2>; 1201; CHECK-EMPTY: 1202; CHECK-NEXT: // %bb.0: 1203; CHECK-NEXT: ld.param.u64 %rd1, [global_2xfloat_param_0]; 1204; CHECK-NEXT: ld.global.v2.f32 {%f1, %f2}, [%rd1]; 1205; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 1206; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 1207; CHECK-NEXT: st.global.v2.f32 [%rd1], {%f4, %f3}; 1208; CHECK-NEXT: ret; 1209 %a.load = load <2 x float>, ptr addrspace(1) %a 1210 %a.add = fadd <2 x float> %a.load, <float 1., 
float 1.> 1211 store <2 x float> %a.add, ptr addrspace(1) %a 1212 ret void 1213} 1214 1215define void @global_4xfloat(ptr addrspace(1) %a) { 1216; CHECK-LABEL: global_4xfloat( 1217; CHECK: { 1218; CHECK-NEXT: .reg .f32 %f<9>; 1219; CHECK-NEXT: .reg .b64 %rd<2>; 1220; CHECK-EMPTY: 1221; CHECK-NEXT: // %bb.0: 1222; CHECK-NEXT: ld.param.u64 %rd1, [global_4xfloat_param_0]; 1223; CHECK-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 1224; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 1225; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 1226; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 1227; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 1228; CHECK-NEXT: st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 1229; CHECK-NEXT: ret; 1230 %a.load = load <4 x float>, ptr addrspace(1) %a 1231 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 1232 store <4 x float> %a.add, ptr addrspace(1) %a 1233 ret void 1234} 1235 ; Loads <2 x double> from %a (addrspace(1)), fadds 1.0 to each lane, and stores the result back to %a. 1236define void @global_2xdouble(ptr addrspace(1) %a) { 1237; CHECK-LABEL: global_2xdouble( 1238; CHECK: { 1239; CHECK-NEXT: .reg .b64 %rd<2>; 1240; CHECK-NEXT: .reg .f64 %fd<5>; 1241; CHECK-EMPTY: 1242; CHECK-NEXT: // %bb.0: 1243; CHECK-NEXT: ld.param.u64 %rd1, [global_2xdouble_param_0]; 1244; CHECK-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; 1245; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 1246; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 1247; CHECK-NEXT: st.global.v2.f64 [%rd1], {%fd4, %fd3}; 1248; CHECK-NEXT: ret; 1249 %a.load = load <2 x double>, ptr addrspace(1) %a 1250 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 1251 store <2 x double> %a.add, ptr addrspace(1) %a 1252 ret void 1253} 1254 1255; global_volatile: the same vector types as the global group, but every load and store on the ptr addrspace(1) argument carries the volatile keyword; the autogenerated assertions in this group expect ld.volatile.global and st.volatile.global instructions. 1256 ; Volatile load of <2 x i8> from %a (addrspace(1)), add of 1 to each lane, and volatile store of the result back to %a. 1257define void @global_volatile_2xi8(ptr addrspace(1) %a) { 1258; CHECK-LABEL: global_volatile_2xi8( 1259; CHECK: { 1260; CHECK-NEXT: .reg .b16 %rs<5>; 1261; CHECK-NEXT: .reg .b64 %rd<2>; 1262; CHECK-EMPTY: 1263; CHECK-NEXT: // %bb.0: 1264; CHECK-NEXT: ld.param.u64 %rd1, 
[global_volatile_2xi8_param_0]; 1265; CHECK-NEXT: ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1]; 1266; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1267; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1268; CHECK-NEXT: st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3}; 1269; CHECK-NEXT: ret; 1270 %a.load = load volatile <2 x i8>, ptr addrspace(1) %a 1271 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 1272 store volatile <2 x i8> %a.add, ptr addrspace(1) %a 1273 ret void 1274} 1275 1276define void @global_volatile_4xi8(ptr addrspace(1) %a) { 1277; CHECK-LABEL: global_volatile_4xi8( 1278; CHECK: { 1279; CHECK-NEXT: .reg .b16 %rs<9>; 1280; CHECK-NEXT: .reg .b32 %r<13>; 1281; CHECK-NEXT: .reg .b64 %rd<2>; 1282; CHECK-EMPTY: 1283; CHECK-NEXT: // %bb.0: 1284; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi8_param_0]; 1285; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; 1286; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 1287; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 1288; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1289; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 1290; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 1291; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 1292; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1293; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 1294; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 1295; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 1296; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 1297; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1298; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 1299; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 1300; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 1301; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1302; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 1303; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 1304; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 1305; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r12; 1306; CHECK-NEXT: ret; 1307 %a.load = load volatile <4 x i8>, ptr addrspace(1) %a 1308 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 1309 store volatile <4 x i8> %a.add, ptr addrspace(1) %a 1310 ret void 1311} 1312 1313define void @global_volatile_8xi8(ptr 
addrspace(1) %a) { 1314; CHECK-LABEL: global_volatile_8xi8( 1315; CHECK: { 1316; CHECK-NEXT: .reg .b16 %rs<17>; 1317; CHECK-NEXT: .reg .b32 %r<25>; 1318; CHECK-NEXT: .reg .b64 %rd<2>; 1319; CHECK-EMPTY: 1320; CHECK-NEXT: // %bb.0: 1321; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi8_param_0]; 1322; CHECK-NEXT: ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1]; 1323; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 1324; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 1325; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1326; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 1327; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 1328; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 1329; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1330; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 1331; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 1332; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 1333; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 1334; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1335; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 1336; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 1337; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 1338; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1339; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 1340; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 1341; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 1342; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 1343; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 1344; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 1345; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 1346; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 1347; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 1348; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 1349; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 1350; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 1351; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 1352; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 1353; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 1354; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 1355; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 1356; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 1357; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 1358; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 1359; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 1360; CHECK-NEXT: 
prmt.b32 %r24, %r23, %r18, 0x5410U; 1361; CHECK-NEXT: st.volatile.global.v2.b32 [%rd1], {%r24, %r13}; 1362; CHECK-NEXT: ret; 1363 %a.load = load volatile <8 x i8>, ptr addrspace(1) %a 1364 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1365 store volatile <8 x i8> %a.add, ptr addrspace(1) %a 1366 ret void 1367} 1368 1369define void @global_volatile_16xi8(ptr addrspace(1) %a) { 1370; CHECK-LABEL: global_volatile_16xi8( 1371; CHECK: { 1372; CHECK-NEXT: .reg .b16 %rs<33>; 1373; CHECK-NEXT: .reg .b32 %r<49>; 1374; CHECK-NEXT: .reg .b64 %rd<2>; 1375; CHECK-EMPTY: 1376; CHECK-NEXT: // %bb.0: 1377; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_16xi8_param_0]; 1378; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 1379; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 1380; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 1381; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1382; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 1383; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 1384; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 1385; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1386; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 1387; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 1388; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 1389; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 1390; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1391; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 1392; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 1393; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 1394; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1395; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 1396; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 1397; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 1398; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 1399; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 1400; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 1401; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 1402; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 1403; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 1404; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 1405; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 1406; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 1407; 
CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 1408; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 1409; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 1410; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 1411; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 1412; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 1413; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 1414; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 1415; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 1416; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 1417; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 1418; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 1419; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 1420; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 1421; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 1422; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 1423; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 1424; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 1425; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 1426; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 1427; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 1428; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 1429; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 1430; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 1431; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 1432; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 1433; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 1434; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 1435; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 1436; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 1437; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 1438; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 1439; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 1440; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 1441; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 1442; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 1443; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 1444; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 1445; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 1446; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 1447; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 1448; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 1449; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 1450; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 1451; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 
1452; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 1453; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 1454; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 1455; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 1456; CHECK-NEXT: ret; 1457 %a.load = load volatile <16 x i8>, ptr addrspace(1) %a 1458 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1459 store volatile <16 x i8> %a.add, ptr addrspace(1) %a 1460 ret void 1461} 1462 1463define void @global_volatile_2xi16(ptr addrspace(1) %a) { 1464; CHECK-LABEL: global_volatile_2xi16( 1465; CHECK: { 1466; CHECK-NEXT: .reg .b16 %rs<5>; 1467; CHECK-NEXT: .reg .b32 %r<3>; 1468; CHECK-NEXT: .reg .b64 %rd<2>; 1469; CHECK-EMPTY: 1470; CHECK-NEXT: // %bb.0: 1471; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; 1472; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; 1473; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 1474; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1475; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1476; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 1477; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; 1478; CHECK-NEXT: ret; 1479 %a.load = load volatile <2 x i16>, ptr addrspace(1) %a 1480 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 1481 store volatile <2 x i16> %a.add, ptr addrspace(1) %a 1482 ret void 1483} 1484 1485define void @global_volatile_4xi16(ptr addrspace(1) %a) { 1486; CHECK-LABEL: global_volatile_4xi16( 1487; CHECK: { 1488; CHECK-NEXT: .reg .b16 %rs<9>; 1489; CHECK-NEXT: .reg .b64 %rd<2>; 1490; CHECK-EMPTY: 1491; CHECK-NEXT: // %bb.0: 1492; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; 1493; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 1494; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 1495; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 1496; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 1497; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 1498; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 
1499; CHECK-NEXT: ret; 1500 %a.load = load volatile <4 x i16>, ptr addrspace(1) %a 1501 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 1502 store volatile <4 x i16> %a.add, ptr addrspace(1) %a 1503 ret void 1504} 1505 1506define void @global_volatile_8xi16(ptr addrspace(1) %a) { 1507; CHECK-LABEL: global_volatile_8xi16( 1508; CHECK: { 1509; CHECK-NEXT: .reg .b16 %rs<17>; 1510; CHECK-NEXT: .reg .b32 %r<9>; 1511; CHECK-NEXT: .reg .b64 %rd<2>; 1512; CHECK-EMPTY: 1513; CHECK-NEXT: // %bb.0: 1514; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_8xi16_param_0]; 1515; CHECK-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 1516; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 1517; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1518; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1519; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 1520; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 1521; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 1522; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 1523; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 1524; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 1525; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 1526; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 1527; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 1528; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 1529; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 1530; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 1531; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 1532; CHECK-NEXT: st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 1533; CHECK-NEXT: ret; 1534 %a.load = load volatile <8 x i16>, ptr addrspace(1) %a 1535 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1536 store volatile <8 x i16> %a.add, ptr addrspace(1) %a 1537 ret void 1538} 1539 1540define void @global_volatile_2xi32(ptr addrspace(1) %a) { 1541; CHECK-LABEL: global_volatile_2xi32( 1542; CHECK: { 1543; CHECK-NEXT: .reg .b32 %r<5>; 1544; CHECK-NEXT: .reg .b64 %rd<2>; 1545; CHECK-EMPTY: 1546; CHECK-NEXT: // %bb.0: 1547; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; 1548; 
CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; 1549; CHECK-NEXT: add.s32 %r3, %r2, 1; 1550; CHECK-NEXT: add.s32 %r4, %r1, 1; 1551; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; 1552; CHECK-NEXT: ret; 1553 %a.load = load volatile <2 x i32>, ptr addrspace(1) %a 1554 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 1555 store volatile <2 x i32> %a.add, ptr addrspace(1) %a 1556 ret void 1557} 1558 1559define void @global_volatile_4xi32(ptr addrspace(1) %a) { 1560; CHECK-LABEL: global_volatile_4xi32( 1561; CHECK: { 1562; CHECK-NEXT: .reg .b32 %r<9>; 1563; CHECK-NEXT: .reg .b64 %rd<2>; 1564; CHECK-EMPTY: 1565; CHECK-NEXT: // %bb.0: 1566; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; 1567; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 1568; CHECK-NEXT: add.s32 %r5, %r4, 1; 1569; CHECK-NEXT: add.s32 %r6, %r3, 1; 1570; CHECK-NEXT: add.s32 %r7, %r2, 1; 1571; CHECK-NEXT: add.s32 %r8, %r1, 1; 1572; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 1573; CHECK-NEXT: ret; 1574 %a.load = load volatile <4 x i32>, ptr addrspace(1) %a 1575 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 1576 store volatile <4 x i32> %a.add, ptr addrspace(1) %a 1577 ret void 1578} 1579 1580define void @global_volatile_2xi64(ptr addrspace(1) %a) { 1581; CHECK-LABEL: global_volatile_2xi64( 1582; CHECK: { 1583; CHECK-NEXT: .reg .b64 %rd<6>; 1584; CHECK-EMPTY: 1585; CHECK-NEXT: // %bb.0: 1586; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0]; 1587; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; 1588; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 1589; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 1590; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; 1591; CHECK-NEXT: ret; 1592 %a.load = load volatile <2 x i64>, ptr addrspace(1) %a 1593 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 1594 store volatile <2 x i64> %a.add, ptr addrspace(1) %a 1595 ret void 1596} 1597 1598define void 
@global_volatile_2xfloat(ptr addrspace(1) %a) { 1599; CHECK-LABEL: global_volatile_2xfloat( 1600; CHECK: { 1601; CHECK-NEXT: .reg .f32 %f<5>; 1602; CHECK-NEXT: .reg .b64 %rd<2>; 1603; CHECK-EMPTY: 1604; CHECK-NEXT: // %bb.0: 1605; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0]; 1606; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; 1607; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 1608; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 1609; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; 1610; CHECK-NEXT: ret; 1611 %a.load = load volatile <2 x float>, ptr addrspace(1) %a 1612 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 1613 store volatile <2 x float> %a.add, ptr addrspace(1) %a 1614 ret void 1615} 1616 1617define void @global_volatile_4xfloat(ptr addrspace(1) %a) { 1618; CHECK-LABEL: global_volatile_4xfloat( 1619; CHECK: { 1620; CHECK-NEXT: .reg .f32 %f<9>; 1621; CHECK-NEXT: .reg .b64 %rd<2>; 1622; CHECK-EMPTY: 1623; CHECK-NEXT: // %bb.0: 1624; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; 1625; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 1626; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 1627; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 1628; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 1629; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 1630; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 1631; CHECK-NEXT: ret; 1632 %a.load = load volatile <4 x float>, ptr addrspace(1) %a 1633 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 1634 store volatile <4 x float> %a.add, ptr addrspace(1) %a 1635 ret void 1636} 1637 1638define void @global_volatile_2xdouble(ptr addrspace(1) %a) { 1639; CHECK-LABEL: global_volatile_2xdouble( 1640; CHECK: { 1641; CHECK-NEXT: .reg .b64 %rd<2>; 1642; CHECK-NEXT: .reg .f64 %fd<5>; 1643; CHECK-EMPTY: 1644; CHECK-NEXT: // %bb.0: 1645; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; 1646; 
CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; 1647; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 1648; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 1649; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; 1650; CHECK-NEXT: ret; 1651 %a.load = load volatile <2 x double>, ptr addrspace(1) %a 1652 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 1653 store volatile <2 x double> %a.add, ptr addrspace(1) %a 1654 ret void 1655} 1656 1657;; shared statespace 1658 1659; shared 1660 1661define void @shared_2xi8(ptr addrspace(3) %a) { 1662; CHECK-LABEL: shared_2xi8( 1663; CHECK: { 1664; CHECK-NEXT: .reg .b16 %rs<5>; 1665; CHECK-NEXT: .reg .b64 %rd<2>; 1666; CHECK-EMPTY: 1667; CHECK-NEXT: // %bb.0: 1668; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; 1669; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; 1670; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1671; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1672; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; 1673; CHECK-NEXT: ret; 1674 %a.load = load <2 x i8>, ptr addrspace(3) %a 1675 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 1676 store <2 x i8> %a.add, ptr addrspace(3) %a 1677 ret void 1678} 1679 1680define void @shared_4xi8(ptr addrspace(3) %a) { 1681; CHECK-LABEL: shared_4xi8( 1682; CHECK: { 1683; CHECK-NEXT: .reg .b16 %rs<9>; 1684; CHECK-NEXT: .reg .b32 %r<13>; 1685; CHECK-NEXT: .reg .b64 %rd<2>; 1686; CHECK-EMPTY: 1687; CHECK-NEXT: // %bb.0: 1688; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; 1689; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; 1690; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 1691; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 1692; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1693; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 1694; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 1695; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 1696; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1697; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 1698; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 1699; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 1700; 
CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 1701; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1702; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 1703; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 1704; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 1705; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1706; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 1707; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 1708; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 1709; CHECK-NEXT: st.shared.u32 [%rd1], %r12; 1710; CHECK-NEXT: ret; 1711 %a.load = load <4 x i8>, ptr addrspace(3) %a 1712 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 1713 store <4 x i8> %a.add, ptr addrspace(3) %a 1714 ret void 1715} 1716 1717define void @shared_8xi8(ptr addrspace(3) %a) { 1718; CHECK-LABEL: shared_8xi8( 1719; CHECK: { 1720; CHECK-NEXT: .reg .b16 %rs<17>; 1721; CHECK-NEXT: .reg .b32 %r<25>; 1722; CHECK-NEXT: .reg .b64 %rd<2>; 1723; CHECK-EMPTY: 1724; CHECK-NEXT: // %bb.0: 1725; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi8_param_0]; 1726; CHECK-NEXT: ld.shared.v2.b32 {%r1, %r2}, [%rd1]; 1727; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 1728; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 1729; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1730; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 1731; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 1732; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 1733; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1734; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 1735; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 1736; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 1737; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 1738; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1739; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 1740; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 1741; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 1742; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1743; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 1744; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 1745; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 1746; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 1747; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 1748; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 1749; CHECK-NEXT: cvt.u32.u16 %r15, 
%rs10; 1750; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 1751; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 1752; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 1753; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 1754; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 1755; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 1756; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 1757; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 1758; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 1759; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 1760; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 1761; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 1762; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 1763; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 1764; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 1765; CHECK-NEXT: st.shared.v2.b32 [%rd1], {%r24, %r13}; 1766; CHECK-NEXT: ret; 1767 %a.load = load <8 x i8>, ptr addrspace(3) %a 1768 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1769 store <8 x i8> %a.add, ptr addrspace(3) %a 1770 ret void 1771} 1772 1773define void @shared_16xi8(ptr addrspace(3) %a) { 1774; CHECK-LABEL: shared_16xi8( 1775; CHECK: { 1776; CHECK-NEXT: .reg .b16 %rs<33>; 1777; CHECK-NEXT: .reg .b32 %r<49>; 1778; CHECK-NEXT: .reg .b64 %rd<2>; 1779; CHECK-EMPTY: 1780; CHECK-NEXT: // %bb.0: 1781; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi8_param_0]; 1782; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 1783; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 1784; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 1785; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 1786; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 1787; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 1788; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 1789; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 1790; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 1791; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 1792; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 1793; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 1794; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 1795; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 1796; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 1797; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 1798; 
CHECK-NEXT: add.s16 %rs8, %rs7, 1; 1799; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 1800; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 1801; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 1802; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 1803; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 1804; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 1805; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 1806; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 1807; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 1808; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 1809; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 1810; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 1811; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 1812; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 1813; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 1814; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 1815; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 1816; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 1817; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 1818; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 1819; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 1820; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 1821; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 1822; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 1823; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 1824; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 1825; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 1826; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 1827; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 1828; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 1829; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 1830; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 1831; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 1832; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 1833; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 1834; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 1835; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 1836; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 1837; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 1838; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 1839; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 1840; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 1841; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 1842; CHECK-NEXT: add.s16 
%rs26, %rs25, 1; 1843; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 1844; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 1845; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 1846; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 1847; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 1848; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 1849; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 1850; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 1851; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 1852; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 1853; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 1854; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 1855; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 1856; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 1857; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 1858; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 1859; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 1860; CHECK-NEXT: ret; 1861 %a.load = load <16 x i8>, ptr addrspace(3) %a 1862 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 1863 store <16 x i8> %a.add, ptr addrspace(3) %a 1864 ret void 1865} 1866 1867define void @shared_2xi16(ptr addrspace(3) %a) { 1868; CHECK-LABEL: shared_2xi16( 1869; CHECK: { 1870; CHECK-NEXT: .reg .b16 %rs<5>; 1871; CHECK-NEXT: .reg .b32 %r<3>; 1872; CHECK-NEXT: .reg .b64 %rd<2>; 1873; CHECK-EMPTY: 1874; CHECK-NEXT: // %bb.0: 1875; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; 1876; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; 1877; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 1878; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1879; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1880; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 1881; CHECK-NEXT: st.shared.u32 [%rd1], %r2; 1882; CHECK-NEXT: ret; 1883 %a.load = load <2 x i16>, ptr addrspace(3) %a 1884 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 1885 store <2 x i16> %a.add, ptr addrspace(3) %a 1886 ret void 1887} 1888 1889define void @shared_4xi16(ptr addrspace(3) %a) { 1890; CHECK-LABEL: shared_4xi16( 1891; CHECK: { 1892; CHECK-NEXT: .reg .b16 
%rs<9>; 1893; CHECK-NEXT: .reg .b64 %rd<2>; 1894; CHECK-EMPTY: 1895; CHECK-NEXT: // %bb.0: 1896; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; 1897; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 1898; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 1899; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 1900; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 1901; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 1902; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 1903; CHECK-NEXT: ret; 1904 %a.load = load <4 x i16>, ptr addrspace(3) %a 1905 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 1906 store <4 x i16> %a.add, ptr addrspace(3) %a 1907 ret void 1908} 1909 1910define void @shared_8xi16(ptr addrspace(3) %a) { 1911; CHECK-LABEL: shared_8xi16( 1912; CHECK: { 1913; CHECK-NEXT: .reg .b16 %rs<17>; 1914; CHECK-NEXT: .reg .b32 %r<9>; 1915; CHECK-NEXT: .reg .b64 %rd<2>; 1916; CHECK-EMPTY: 1917; CHECK-NEXT: // %bb.0: 1918; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi16_param_0]; 1919; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 1920; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 1921; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 1922; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 1923; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 1924; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 1925; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 1926; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 1927; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 1928; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 1929; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 1930; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 1931; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 1932; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 1933; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 1934; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 1935; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 1936; CHECK-NEXT: st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 1937; CHECK-NEXT: ret; 1938 %a.load = load <8 x i16>, ptr addrspace(3) %a 1939 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 1940 store <8 
x i16> %a.add, ptr addrspace(3) %a 1941 ret void 1942} 1943 1944define void @shared_2xi32(ptr addrspace(3) %a) { 1945; CHECK-LABEL: shared_2xi32( 1946; CHECK: { 1947; CHECK-NEXT: .reg .b32 %r<5>; 1948; CHECK-NEXT: .reg .b64 %rd<2>; 1949; CHECK-EMPTY: 1950; CHECK-NEXT: // %bb.0: 1951; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; 1952; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; 1953; CHECK-NEXT: add.s32 %r3, %r2, 1; 1954; CHECK-NEXT: add.s32 %r4, %r1, 1; 1955; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; 1956; CHECK-NEXT: ret; 1957 %a.load = load <2 x i32>, ptr addrspace(3) %a 1958 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 1959 store <2 x i32> %a.add, ptr addrspace(3) %a 1960 ret void 1961} 1962 1963define void @shared_4xi32(ptr addrspace(3) %a) { 1964; CHECK-LABEL: shared_4xi32( 1965; CHECK: { 1966; CHECK-NEXT: .reg .b32 %r<9>; 1967; CHECK-NEXT: .reg .b64 %rd<2>; 1968; CHECK-EMPTY: 1969; CHECK-NEXT: // %bb.0: 1970; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; 1971; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 1972; CHECK-NEXT: add.s32 %r5, %r4, 1; 1973; CHECK-NEXT: add.s32 %r6, %r3, 1; 1974; CHECK-NEXT: add.s32 %r7, %r2, 1; 1975; CHECK-NEXT: add.s32 %r8, %r1, 1; 1976; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 1977; CHECK-NEXT: ret; 1978 %a.load = load <4 x i32>, ptr addrspace(3) %a 1979 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 1980 store <4 x i32> %a.add, ptr addrspace(3) %a 1981 ret void 1982} 1983 1984define void @shared_2xi64(ptr addrspace(3) %a) { 1985; CHECK-LABEL: shared_2xi64( 1986; CHECK: { 1987; CHECK-NEXT: .reg .b64 %rd<6>; 1988; CHECK-EMPTY: 1989; CHECK-NEXT: // %bb.0: 1990; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; 1991; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; 1992; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 1993; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 1994; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; 1995; CHECK-NEXT: ret; 1996 %a.load = load <2 x 
i64>, ptr addrspace(3) %a 1997 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 1998 store <2 x i64> %a.add, ptr addrspace(3) %a 1999 ret void 2000} 2001 2002define void @shared_2xfloat(ptr addrspace(3) %a) { 2003; CHECK-LABEL: shared_2xfloat( 2004; CHECK: { 2005; CHECK-NEXT: .reg .f32 %f<5>; 2006; CHECK-NEXT: .reg .b64 %rd<2>; 2007; CHECK-EMPTY: 2008; CHECK-NEXT: // %bb.0: 2009; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; 2010; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; 2011; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 2012; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 2013; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; 2014; CHECK-NEXT: ret; 2015 %a.load = load <2 x float>, ptr addrspace(3) %a 2016 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 2017 store <2 x float> %a.add, ptr addrspace(3) %a 2018 ret void 2019} 2020 2021define void @shared_4xfloat(ptr addrspace(3) %a) { 2022; CHECK-LABEL: shared_4xfloat( 2023; CHECK: { 2024; CHECK-NEXT: .reg .f32 %f<9>; 2025; CHECK-NEXT: .reg .b64 %rd<2>; 2026; CHECK-EMPTY: 2027; CHECK-NEXT: // %bb.0: 2028; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; 2029; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 2030; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 2031; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 2032; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 2033; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 2034; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 2035; CHECK-NEXT: ret; 2036 %a.load = load <4 x float>, ptr addrspace(3) %a 2037 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 2038 store <4 x float> %a.add, ptr addrspace(3) %a 2039 ret void 2040} 2041 2042define void @shared_2xdouble(ptr addrspace(3) %a) { 2043; CHECK-LABEL: shared_2xdouble( 2044; CHECK: { 2045; CHECK-NEXT: .reg .b64 %rd<2>; 2046; CHECK-NEXT: .reg .f64 %fd<5>; 2047; CHECK-EMPTY: 2048; CHECK-NEXT: // %bb.0: 2049; CHECK-NEXT: ld.param.u64 %rd1, 
[shared_2xdouble_param_0]; 2050; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; 2051; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 2052; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 2053; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; 2054; CHECK-NEXT: ret; 2055 %a.load = load <2 x double>, ptr addrspace(3) %a 2056 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 2057 store <2 x double> %a.add, ptr addrspace(3) %a 2058 ret void 2059} 2060 2061; shared_volatile 2062 2063define void @shared_volatile_2xi8(ptr addrspace(3) %a) { 2064; CHECK-LABEL: shared_volatile_2xi8( 2065; CHECK: { 2066; CHECK-NEXT: .reg .b16 %rs<5>; 2067; CHECK-NEXT: .reg .b64 %rd<2>; 2068; CHECK-EMPTY: 2069; CHECK-NEXT: // %bb.0: 2070; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; 2071; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; 2072; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2073; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2074; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; 2075; CHECK-NEXT: ret; 2076 %a.load = load volatile <2 x i8>, ptr addrspace(3) %a 2077 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 2078 store volatile <2 x i8> %a.add, ptr addrspace(3) %a 2079 ret void 2080} 2081 2082define void @shared_volatile_4xi8(ptr addrspace(3) %a) { 2083; CHECK-LABEL: shared_volatile_4xi8( 2084; CHECK: { 2085; CHECK-NEXT: .reg .b16 %rs<9>; 2086; CHECK-NEXT: .reg .b32 %r<13>; 2087; CHECK-NEXT: .reg .b64 %rd<2>; 2088; CHECK-EMPTY: 2089; CHECK-NEXT: // %bb.0: 2090; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; 2091; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; 2092; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 2093; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 2094; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2095; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 2096; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 2097; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 2098; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2099; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 2100; CHECK-NEXT: prmt.b32 %r6, 
%r5, %r3, 0x3340U; 2101; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 2102; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 2103; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2104; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 2105; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 2106; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 2107; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2108; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 2109; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 2110; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 2111; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; 2112; CHECK-NEXT: ret; 2113 %a.load = load volatile <4 x i8>, ptr addrspace(3) %a 2114 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 2115 store volatile <4 x i8> %a.add, ptr addrspace(3) %a 2116 ret void 2117} 2118 2119define void @shared_volatile_8xi8(ptr addrspace(3) %a) { 2120; CHECK-LABEL: shared_volatile_8xi8( 2121; CHECK: { 2122; CHECK-NEXT: .reg .b16 %rs<17>; 2123; CHECK-NEXT: .reg .b32 %r<25>; 2124; CHECK-NEXT: .reg .b64 %rd<2>; 2125; CHECK-EMPTY: 2126; CHECK-NEXT: // %bb.0: 2127; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0]; 2128; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1]; 2129; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 2130; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 2131; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2132; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 2133; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 2134; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 2135; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2136; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 2137; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 2138; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 2139; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 2140; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2141; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 2142; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 2143; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 2144; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2145; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 2146; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 2147; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 2148; CHECK-NEXT: bfe.u32 %r14, 
%r1, 24, 8; 2149; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 2150; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 2151; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 2152; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 2153; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 2154; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 2155; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 2156; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 2157; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 2158; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 2159; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 2160; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 2161; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 2162; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 2163; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 2164; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 2165; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 2166; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 2167; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r24, %r13}; 2168; CHECK-NEXT: ret; 2169 %a.load = load volatile <8 x i8>, ptr addrspace(3) %a 2170 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2171 store volatile <8 x i8> %a.add, ptr addrspace(3) %a 2172 ret void 2173} 2174 2175define void @shared_volatile_16xi8(ptr addrspace(3) %a) { 2176; CHECK-LABEL: shared_volatile_16xi8( 2177; CHECK: { 2178; CHECK-NEXT: .reg .b16 %rs<33>; 2179; CHECK-NEXT: .reg .b32 %r<49>; 2180; CHECK-NEXT: .reg .b64 %rd<2>; 2181; CHECK-EMPTY: 2182; CHECK-NEXT: // %bb.0: 2183; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0]; 2184; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 2185; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 2186; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 2187; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2188; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 2189; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 2190; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 2191; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2192; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 2193; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 2194; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 2195; CHECK-NEXT: 
cvt.u16.u32 %rs5, %r10; 2196; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2197; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 2198; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 2199; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 2200; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2201; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 2202; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 2203; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 2204; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 2205; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 2206; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 2207; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 2208; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 2209; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 2210; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 2211; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 2212; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 2213; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 2214; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 2215; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 2216; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 2217; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 2218; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 2219; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 2220; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 2221; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 2222; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 2223; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 2224; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 2225; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 2226; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 2227; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 2228; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 2229; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 2230; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 2231; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 2232; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 2233; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 2234; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 2235; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 2236; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 2237; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 2238; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 2239; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 2240; CHECK-NEXT: 
prmt.b32 %r36, %r35, %r33, 0x3340U; 2241; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 2242; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 2243; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 2244; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 2245; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 2246; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 2247; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 2248; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 2249; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 2250; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 2251; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 2252; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 2253; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 2254; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 2255; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 2256; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 2257; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 2258; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 2259; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 2260; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 2261; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 2262; CHECK-NEXT: ret; 2263 %a.load = load volatile <16 x i8>, ptr addrspace(3) %a 2264 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2265 store volatile <16 x i8> %a.add, ptr addrspace(3) %a 2266 ret void 2267} 2268 2269define void @shared_volatile_2xi16(ptr addrspace(3) %a) { 2270; CHECK-LABEL: shared_volatile_2xi16( 2271; CHECK: { 2272; CHECK-NEXT: .reg .b16 %rs<5>; 2273; CHECK-NEXT: .reg .b32 %r<3>; 2274; CHECK-NEXT: .reg .b64 %rd<2>; 2275; CHECK-EMPTY: 2276; CHECK-NEXT: // %bb.0: 2277; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; 2278; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; 2279; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 2280; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2281; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2282; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 2283; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; 2284; CHECK-NEXT: ret; 2285 %a.load = load volatile <2 x 
i16>, ptr addrspace(3) %a 2286 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 2287 store volatile <2 x i16> %a.add, ptr addrspace(3) %a 2288 ret void 2289} 2290 2291define void @shared_volatile_4xi16(ptr addrspace(3) %a) { 2292; CHECK-LABEL: shared_volatile_4xi16( 2293; CHECK: { 2294; CHECK-NEXT: .reg .b16 %rs<9>; 2295; CHECK-NEXT: .reg .b64 %rd<2>; 2296; CHECK-EMPTY: 2297; CHECK-NEXT: // %bb.0: 2298; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0]; 2299; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 2300; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 2301; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 2302; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 2303; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 2304; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 2305; CHECK-NEXT: ret; 2306 %a.load = load volatile <4 x i16>, ptr addrspace(3) %a 2307 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 2308 store volatile <4 x i16> %a.add, ptr addrspace(3) %a 2309 ret void 2310} 2311 2312define void @shared_volatile_8xi16(ptr addrspace(3) %a) { 2313; CHECK-LABEL: shared_volatile_8xi16( 2314; CHECK: { 2315; CHECK-NEXT: .reg .b16 %rs<17>; 2316; CHECK-NEXT: .reg .b32 %r<9>; 2317; CHECK-NEXT: .reg .b64 %rd<2>; 2318; CHECK-EMPTY: 2319; CHECK-NEXT: // %bb.0: 2320; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0]; 2321; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 2322; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 2323; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2324; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2325; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 2326; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 2327; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 2328; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 2329; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 2330; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 2331; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 2332; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 2333; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 2334; CHECK-NEXT: mov.b32 {%rs13, 
%rs14}, %r1; 2335; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 2336; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 2337; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 2338; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 2339; CHECK-NEXT: ret; 2340 %a.load = load volatile <8 x i16>, ptr addrspace(3) %a 2341 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 2342 store volatile <8 x i16> %a.add, ptr addrspace(3) %a 2343 ret void 2344} 2345 2346define void @shared_volatile_2xi32(ptr addrspace(3) %a) { 2347; CHECK-LABEL: shared_volatile_2xi32( 2348; CHECK: { 2349; CHECK-NEXT: .reg .b32 %r<5>; 2350; CHECK-NEXT: .reg .b64 %rd<2>; 2351; CHECK-EMPTY: 2352; CHECK-NEXT: // %bb.0: 2353; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; 2354; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; 2355; CHECK-NEXT: add.s32 %r3, %r2, 1; 2356; CHECK-NEXT: add.s32 %r4, %r1, 1; 2357; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; 2358; CHECK-NEXT: ret; 2359 %a.load = load volatile <2 x i32>, ptr addrspace(3) %a 2360 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 2361 store volatile <2 x i32> %a.add, ptr addrspace(3) %a 2362 ret void 2363} 2364 2365define void @shared_volatile_4xi32(ptr addrspace(3) %a) { 2366; CHECK-LABEL: shared_volatile_4xi32( 2367; CHECK: { 2368; CHECK-NEXT: .reg .b32 %r<9>; 2369; CHECK-NEXT: .reg .b64 %rd<2>; 2370; CHECK-EMPTY: 2371; CHECK-NEXT: // %bb.0: 2372; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0]; 2373; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 2374; CHECK-NEXT: add.s32 %r5, %r4, 1; 2375; CHECK-NEXT: add.s32 %r6, %r3, 1; 2376; CHECK-NEXT: add.s32 %r7, %r2, 1; 2377; CHECK-NEXT: add.s32 %r8, %r1, 1; 2378; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 2379; CHECK-NEXT: ret; 2380 %a.load = load volatile <4 x i32>, ptr addrspace(3) %a 2381 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 2382 store volatile <4 x 
i32> %a.add, ptr addrspace(3) %a 2383 ret void 2384} 2385 2386define void @shared_volatile_2xi64(ptr addrspace(3) %a) { 2387; CHECK-LABEL: shared_volatile_2xi64( 2388; CHECK: { 2389; CHECK-NEXT: .reg .b64 %rd<6>; 2390; CHECK-EMPTY: 2391; CHECK-NEXT: // %bb.0: 2392; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; 2393; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; 2394; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 2395; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 2396; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; 2397; CHECK-NEXT: ret; 2398 %a.load = load volatile <2 x i64>, ptr addrspace(3) %a 2399 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 2400 store volatile <2 x i64> %a.add, ptr addrspace(3) %a 2401 ret void 2402} 2403 2404define void @shared_volatile_2xfloat(ptr addrspace(3) %a) { 2405; CHECK-LABEL: shared_volatile_2xfloat( 2406; CHECK: { 2407; CHECK-NEXT: .reg .f32 %f<5>; 2408; CHECK-NEXT: .reg .b64 %rd<2>; 2409; CHECK-EMPTY: 2410; CHECK-NEXT: // %bb.0: 2411; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; 2412; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; 2413; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 2414; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 2415; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; 2416; CHECK-NEXT: ret; 2417 %a.load = load volatile <2 x float>, ptr addrspace(3) %a 2418 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 2419 store volatile <2 x float> %a.add, ptr addrspace(3) %a 2420 ret void 2421} 2422 2423define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { 2424; CHECK-LABEL: shared_volatile_4xfloat( 2425; CHECK: { 2426; CHECK-NEXT: .reg .f32 %f<9>; 2427; CHECK-NEXT: .reg .b64 %rd<2>; 2428; CHECK-EMPTY: 2429; CHECK-NEXT: // %bb.0: 2430; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; 2431; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 2432; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 2433; CHECK-NEXT: 
add.rn.f32 %f6, %f3, 0f3F800000; 2434; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 2435; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 2436; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 2437; CHECK-NEXT: ret; 2438 %a.load = load volatile <4 x float>, ptr addrspace(3) %a 2439 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 2440 store volatile <4 x float> %a.add, ptr addrspace(3) %a 2441 ret void 2442} 2443 2444define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { 2445; CHECK-LABEL: shared_volatile_2xdouble( 2446; CHECK: { 2447; CHECK-NEXT: .reg .b64 %rd<2>; 2448; CHECK-NEXT: .reg .f64 %fd<5>; 2449; CHECK-EMPTY: 2450; CHECK-NEXT: // %bb.0: 2451; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; 2452; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; 2453; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 2454; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 2455; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; 2456; CHECK-NEXT: ret; 2457 %a.load = load volatile <2 x double>, ptr addrspace(3) %a 2458 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 2459 store volatile <2 x double> %a.add, ptr addrspace(3) %a 2460 ret void 2461} 2462 2463;; local statespace 2464 2465; local 2466 2467define void @local_2xi8(ptr addrspace(5) %a) { 2468; CHECK-LABEL: local_2xi8( 2469; CHECK: { 2470; CHECK-NEXT: .reg .b16 %rs<5>; 2471; CHECK-NEXT: .reg .b64 %rd<2>; 2472; CHECK-EMPTY: 2473; CHECK-NEXT: // %bb.0: 2474; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; 2475; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; 2476; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2477; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2478; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; 2479; CHECK-NEXT: ret; 2480 %a.load = load <2 x i8>, ptr addrspace(5) %a 2481 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 2482 store <2 x i8> %a.add, ptr addrspace(5) %a 2483 ret void 2484} 2485 2486define void 
@local_4xi8(ptr addrspace(5) %a) { 2487; CHECK-LABEL: local_4xi8( 2488; CHECK: { 2489; CHECK-NEXT: .reg .b16 %rs<9>; 2490; CHECK-NEXT: .reg .b32 %r<13>; 2491; CHECK-NEXT: .reg .b64 %rd<2>; 2492; CHECK-EMPTY: 2493; CHECK-NEXT: // %bb.0: 2494; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; 2495; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; 2496; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 2497; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 2498; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2499; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 2500; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 2501; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 2502; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2503; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 2504; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 2505; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 2506; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 2507; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2508; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 2509; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 2510; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 2511; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2512; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 2513; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 2514; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 2515; CHECK-NEXT: st.local.u32 [%rd1], %r12; 2516; CHECK-NEXT: ret; 2517 %a.load = load <4 x i8>, ptr addrspace(5) %a 2518 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 2519 store <4 x i8> %a.add, ptr addrspace(5) %a 2520 ret void 2521} 2522 2523define void @local_8xi8(ptr addrspace(5) %a) { 2524; CHECK-LABEL: local_8xi8( 2525; CHECK: { 2526; CHECK-NEXT: .reg .b16 %rs<17>; 2527; CHECK-NEXT: .reg .b32 %r<25>; 2528; CHECK-NEXT: .reg .b64 %rd<2>; 2529; CHECK-EMPTY: 2530; CHECK-NEXT: // %bb.0: 2531; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0]; 2532; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; 2533; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 2534; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 2535; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2536; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 2537; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; 2538; 
CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 2539; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2540; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 2541; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 2542; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 2543; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 2544; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2545; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 2546; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 2547; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 2548; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2549; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 2550; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 2551; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 2552; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 2553; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 2554; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 2555; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 2556; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 2557; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 2558; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 2559; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 2560; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 2561; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 2562; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 2563; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 2564; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 2565; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 2566; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 2567; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 2568; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 2569; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 2570; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 2571; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13}; 2572; CHECK-NEXT: ret; 2573 %a.load = load <8 x i8>, ptr addrspace(5) %a 2574 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2575 store <8 x i8> %a.add, ptr addrspace(5) %a 2576 ret void 2577} 2578 2579define void @local_16xi8(ptr addrspace(5) %a) { 2580; CHECK-LABEL: local_16xi8( 2581; CHECK: { 2582; CHECK-NEXT: .reg .b16 %rs<33>; 2583; CHECK-NEXT: .reg .b32 %r<49>; 2584; CHECK-NEXT: .reg .b64 %rd<2>; 2585; CHECK-EMPTY: 2586; CHECK-NEXT: // %bb.0: 
2587; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0]; 2588; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 2589; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 2590; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 2591; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2592; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 2593; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 2594; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 2595; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2596; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 2597; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 2598; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 2599; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 2600; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2601; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 2602; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 2603; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 2604; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2605; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 2606; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 2607; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 2608; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 2609; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 2610; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 2611; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 2612; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 2613; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 2614; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 2615; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 2616; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 2617; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 2618; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 2619; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 2620; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 2621; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 2622; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 2623; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 2624; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 2625; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 2626; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 2627; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 2628; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 2629; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 2630; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 2631; CHECK-NEXT: bfe.u32 
%r29, %r2, 16, 8; 2632; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 2633; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 2634; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 2635; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 2636; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 2637; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 2638; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 2639; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 2640; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 2641; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 2642; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 2643; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 2644; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 2645; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 2646; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 2647; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 2648; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 2649; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 2650; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 2651; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 2652; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 2653; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 2654; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 2655; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 2656; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 2657; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 2658; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 2659; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 2660; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 2661; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 2662; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 2663; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 2664; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 2665; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 2666; CHECK-NEXT: ret; 2667 %a.load = load <16 x i8>, ptr addrspace(5) %a 2668 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2669 store <16 x i8> %a.add, ptr addrspace(5) %a 2670 ret void 2671} 2672 2673define void @local_2xi16(ptr addrspace(5) %a) { 2674; CHECK-LABEL: local_2xi16( 2675; CHECK: { 2676; CHECK-NEXT: .reg .b16 %rs<5>; 2677; 
CHECK-NEXT: .reg .b32 %r<3>; 2678; CHECK-NEXT: .reg .b64 %rd<2>; 2679; CHECK-EMPTY: 2680; CHECK-NEXT: // %bb.0: 2681; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; 2682; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; 2683; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 2684; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2685; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2686; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 2687; CHECK-NEXT: st.local.u32 [%rd1], %r2; 2688; CHECK-NEXT: ret; 2689 %a.load = load <2 x i16>, ptr addrspace(5) %a 2690 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 2691 store <2 x i16> %a.add, ptr addrspace(5) %a 2692 ret void 2693} 2694 2695define void @local_4xi16(ptr addrspace(5) %a) { 2696; CHECK-LABEL: local_4xi16( 2697; CHECK: { 2698; CHECK-NEXT: .reg .b16 %rs<9>; 2699; CHECK-NEXT: .reg .b64 %rd<2>; 2700; CHECK-EMPTY: 2701; CHECK-NEXT: // %bb.0: 2702; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; 2703; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 2704; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 2705; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 2706; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 2707; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 2708; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 2709; CHECK-NEXT: ret; 2710 %a.load = load <4 x i16>, ptr addrspace(5) %a 2711 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 2712 store <4 x i16> %a.add, ptr addrspace(5) %a 2713 ret void 2714} 2715 2716define void @local_8xi16(ptr addrspace(5) %a) { 2717; CHECK-LABEL: local_8xi16( 2718; CHECK: { 2719; CHECK-NEXT: .reg .b16 %rs<17>; 2720; CHECK-NEXT: .reg .b32 %r<9>; 2721; CHECK-NEXT: .reg .b64 %rd<2>; 2722; CHECK-EMPTY: 2723; CHECK-NEXT: // %bb.0: 2724; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0]; 2725; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 2726; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 2727; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2728; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2729; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 2730; CHECK-NEXT: mov.b32 
{%rs5, %rs6}, %r3; 2731; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 2732; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 2733; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 2734; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 2735; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 2736; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 2737; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 2738; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 2739; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 2740; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 2741; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 2742; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 2743; CHECK-NEXT: ret; 2744 %a.load = load <8 x i16>, ptr addrspace(5) %a 2745 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 2746 store <8 x i16> %a.add, ptr addrspace(5) %a 2747 ret void 2748} 2749 2750define void @local_2xi32(ptr addrspace(5) %a) { 2751; CHECK-LABEL: local_2xi32( 2752; CHECK: { 2753; CHECK-NEXT: .reg .b32 %r<5>; 2754; CHECK-NEXT: .reg .b64 %rd<2>; 2755; CHECK-EMPTY: 2756; CHECK-NEXT: // %bb.0: 2757; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0]; 2758; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; 2759; CHECK-NEXT: add.s32 %r3, %r2, 1; 2760; CHECK-NEXT: add.s32 %r4, %r1, 1; 2761; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; 2762; CHECK-NEXT: ret; 2763 %a.load = load <2 x i32>, ptr addrspace(5) %a 2764 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 2765 store <2 x i32> %a.add, ptr addrspace(5) %a 2766 ret void 2767} 2768 2769define void @local_4xi32(ptr addrspace(5) %a) { 2770; CHECK-LABEL: local_4xi32( 2771; CHECK: { 2772; CHECK-NEXT: .reg .b32 %r<9>; 2773; CHECK-NEXT: .reg .b64 %rd<2>; 2774; CHECK-EMPTY: 2775; CHECK-NEXT: // %bb.0: 2776; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; 2777; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 2778; CHECK-NEXT: add.s32 %r5, %r4, 1; 2779; CHECK-NEXT: add.s32 %r6, %r3, 1; 2780; CHECK-NEXT: add.s32 %r7, %r2, 1; 2781; CHECK-NEXT: add.s32 %r8, %r1, 1; 2782; CHECK-NEXT: 
st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 2783; CHECK-NEXT: ret; 2784 %a.load = load <4 x i32>, ptr addrspace(5) %a 2785 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 2786 store <4 x i32> %a.add, ptr addrspace(5) %a 2787 ret void 2788} 2789 2790define void @local_2xi64(ptr addrspace(5) %a) { 2791; CHECK-LABEL: local_2xi64( 2792; CHECK: { 2793; CHECK-NEXT: .reg .b64 %rd<6>; 2794; CHECK-EMPTY: 2795; CHECK-NEXT: // %bb.0: 2796; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; 2797; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; 2798; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 2799; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 2800; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; 2801; CHECK-NEXT: ret; 2802 %a.load = load <2 x i64>, ptr addrspace(5) %a 2803 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 2804 store <2 x i64> %a.add, ptr addrspace(5) %a 2805 ret void 2806} 2807 2808define void @local_2xfloat(ptr addrspace(5) %a) { 2809; CHECK-LABEL: local_2xfloat( 2810; CHECK: { 2811; CHECK-NEXT: .reg .f32 %f<5>; 2812; CHECK-NEXT: .reg .b64 %rd<2>; 2813; CHECK-EMPTY: 2814; CHECK-NEXT: // %bb.0: 2815; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0]; 2816; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; 2817; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 2818; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 2819; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; 2820; CHECK-NEXT: ret; 2821 %a.load = load <2 x float>, ptr addrspace(5) %a 2822 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 2823 store <2 x float> %a.add, ptr addrspace(5) %a 2824 ret void 2825} 2826 2827define void @local_4xfloat(ptr addrspace(5) %a) { 2828; CHECK-LABEL: local_4xfloat( 2829; CHECK: { 2830; CHECK-NEXT: .reg .f32 %f<9>; 2831; CHECK-NEXT: .reg .b64 %rd<2>; 2832; CHECK-EMPTY: 2833; CHECK-NEXT: // %bb.0: 2834; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0]; 2835; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 2836; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 
2837; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 2838; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 2839; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 2840; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 2841; CHECK-NEXT: ret; 2842 %a.load = load <4 x float>, ptr addrspace(5) %a 2843 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 2844 store <4 x float> %a.add, ptr addrspace(5) %a 2845 ret void 2846} 2847 2848define void @local_2xdouble(ptr addrspace(5) %a) { 2849; CHECK-LABEL: local_2xdouble( 2850; CHECK: { 2851; CHECK-NEXT: .reg .b64 %rd<2>; 2852; CHECK-NEXT: .reg .f64 %fd<5>; 2853; CHECK-EMPTY: 2854; CHECK-NEXT: // %bb.0: 2855; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; 2856; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; 2857; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 2858; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 2859; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; 2860; CHECK-NEXT: ret; 2861 %a.load = load <2 x double>, ptr addrspace(5) %a 2862 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 2863 store <2 x double> %a.add, ptr addrspace(5) %a 2864 ret void 2865} 2866 2867; local_volatile 2868 2869define void @local_volatile_2xi8(ptr addrspace(5) %a) { 2870; CHECK-LABEL: local_volatile_2xi8( 2871; CHECK: { 2872; CHECK-NEXT: .reg .b16 %rs<5>; 2873; CHECK-NEXT: .reg .b64 %rd<2>; 2874; CHECK-EMPTY: 2875; CHECK-NEXT: // %bb.0: 2876; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; 2877; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; 2878; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 2879; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 2880; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; 2881; CHECK-NEXT: ret; 2882 %a.load = load volatile <2 x i8>, ptr addrspace(5) %a 2883 %a.add = add <2 x i8> %a.load, <i8 1, i8 1> 2884 store volatile <2 x i8> %a.add, ptr addrspace(5) %a 2885 ret void 2886} 2887 2888define void @local_volatile_4xi8(ptr addrspace(5) %a) { 2889; 
CHECK-LABEL: local_volatile_4xi8( 2890; CHECK: { 2891; CHECK-NEXT: .reg .b16 %rs<9>; 2892; CHECK-NEXT: .reg .b32 %r<13>; 2893; CHECK-NEXT: .reg .b64 %rd<2>; 2894; CHECK-EMPTY: 2895; CHECK-NEXT: // %bb.0: 2896; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; 2897; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; 2898; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; 2899; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; 2900; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2901; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; 2902; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; 2903; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; 2904; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2905; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; 2906; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; 2907; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; 2908; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; 2909; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2910; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; 2911; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; 2912; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; 2913; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2914; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; 2915; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; 2916; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; 2917; CHECK-NEXT: st.local.u32 [%rd1], %r12; 2918; CHECK-NEXT: ret; 2919 %a.load = load volatile <4 x i8>, ptr addrspace(5) %a 2920 %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1> 2921 store volatile <4 x i8> %a.add, ptr addrspace(5) %a 2922 ret void 2923} 2924 2925define void @local_volatile_8xi8(ptr addrspace(5) %a) { 2926; CHECK-LABEL: local_volatile_8xi8( 2927; CHECK: { 2928; CHECK-NEXT: .reg .b16 %rs<17>; 2929; CHECK-NEXT: .reg .b32 %r<25>; 2930; CHECK-NEXT: .reg .b64 %rd<2>; 2931; CHECK-EMPTY: 2932; CHECK-NEXT: // %bb.0: 2933; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0]; 2934; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1]; 2935; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; 2936; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; 2937; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2938; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; 2939; CHECK-NEXT: bfe.u32 %r5, 
%r2, 16, 8; 2940; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; 2941; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2942; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; 2943; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; 2944; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; 2945; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; 2946; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 2947; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; 2948; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; 2949; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; 2950; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 2951; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; 2952; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; 2953; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; 2954; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; 2955; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; 2956; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 2957; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; 2958; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; 2959; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; 2960; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 2961; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; 2962; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; 2963; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; 2964; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; 2965; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 2966; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; 2967; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; 2968; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; 2969; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 2970; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; 2971; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; 2972; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; 2973; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13}; 2974; CHECK-NEXT: ret; 2975 %a.load = load volatile <8 x i8>, ptr addrspace(5) %a 2976 %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 2977 store volatile <8 x i8> %a.add, ptr addrspace(5) %a 2978 ret void 2979} 2980 2981define void @local_volatile_16xi8(ptr addrspace(5) %a) { 2982; CHECK-LABEL: local_volatile_16xi8( 2983; CHECK: { 2984; CHECK-NEXT: .reg .b16 %rs<33>; 2985; CHECK-NEXT: .reg .b32 %r<49>; 2986; CHECK-NEXT: .reg .b64 
%rd<2>; 2987; CHECK-EMPTY: 2988; CHECK-NEXT: // %bb.0: 2989; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0]; 2990; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 2991; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; 2992; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; 2993; CHECK-NEXT: add.s16 %rs2, %rs1, 1; 2994; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; 2995; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; 2996; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; 2997; CHECK-NEXT: add.s16 %rs4, %rs3, 1; 2998; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; 2999; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; 3000; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; 3001; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; 3002; CHECK-NEXT: add.s16 %rs6, %rs5, 1; 3003; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; 3004; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; 3005; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; 3006; CHECK-NEXT: add.s16 %rs8, %rs7, 1; 3007; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; 3008; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; 3009; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; 3010; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; 3011; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; 3012; CHECK-NEXT: add.s16 %rs10, %rs9, 1; 3013; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; 3014; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; 3015; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; 3016; CHECK-NEXT: add.s16 %rs12, %rs11, 1; 3017; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; 3018; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; 3019; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; 3020; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; 3021; CHECK-NEXT: add.s16 %rs14, %rs13, 1; 3022; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; 3023; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; 3024; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; 3025; CHECK-NEXT: add.s16 %rs16, %rs15, 1; 3026; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; 3027; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; 3028; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; 3029; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; 3030; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; 3031; CHECK-NEXT: add.s16 %rs18, %rs17, 1; 3032; 
CHECK-NEXT: cvt.u32.u16 %r28, %rs18; 3033; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; 3034; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; 3035; CHECK-NEXT: add.s16 %rs20, %rs19, 1; 3036; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; 3037; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; 3038; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; 3039; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; 3040; CHECK-NEXT: add.s16 %rs22, %rs21, 1; 3041; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; 3042; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; 3043; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; 3044; CHECK-NEXT: add.s16 %rs24, %rs23, 1; 3045; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; 3046; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; 3047; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; 3048; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; 3049; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; 3050; CHECK-NEXT: add.s16 %rs26, %rs25, 1; 3051; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; 3052; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; 3053; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; 3054; CHECK-NEXT: add.s16 %rs28, %rs27, 1; 3055; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; 3056; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; 3057; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; 3058; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; 3059; CHECK-NEXT: add.s16 %rs30, %rs29, 1; 3060; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; 3061; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; 3062; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; 3063; CHECK-NEXT: add.s16 %rs32, %rs31, 1; 3064; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; 3065; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; 3066; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; 3067; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; 3068; CHECK-NEXT: ret; 3069 %a.load = load volatile <16 x i8>, ptr addrspace(5) %a 3070 %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> 3071 store volatile <16 x i8> %a.add, ptr addrspace(5) %a 3072 ret void 3073} 3074 3075define void @local_volatile_2xi16(ptr addrspace(5) %a) { 
3076; CHECK-LABEL: local_volatile_2xi16( 3077; CHECK: { 3078; CHECK-NEXT: .reg .b16 %rs<5>; 3079; CHECK-NEXT: .reg .b32 %r<3>; 3080; CHECK-NEXT: .reg .b64 %rd<2>; 3081; CHECK-EMPTY: 3082; CHECK-NEXT: // %bb.0: 3083; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; 3084; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; 3085; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; 3086; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 3087; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 3088; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; 3089; CHECK-NEXT: st.local.u32 [%rd1], %r2; 3090; CHECK-NEXT: ret; 3091 %a.load = load volatile <2 x i16>, ptr addrspace(5) %a 3092 %a.add = add <2 x i16> %a.load, <i16 1, i16 1> 3093 store volatile <2 x i16> %a.add, ptr addrspace(5) %a 3094 ret void 3095} 3096 3097define void @local_volatile_4xi16(ptr addrspace(5) %a) { 3098; CHECK-LABEL: local_volatile_4xi16( 3099; CHECK: { 3100; CHECK-NEXT: .reg .b16 %rs<9>; 3101; CHECK-NEXT: .reg .b64 %rd<2>; 3102; CHECK-EMPTY: 3103; CHECK-NEXT: // %bb.0: 3104; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; 3105; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; 3106; CHECK-NEXT: add.s16 %rs5, %rs4, 1; 3107; CHECK-NEXT: add.s16 %rs6, %rs3, 1; 3108; CHECK-NEXT: add.s16 %rs7, %rs2, 1; 3109; CHECK-NEXT: add.s16 %rs8, %rs1, 1; 3110; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; 3111; CHECK-NEXT: ret; 3112 %a.load = load volatile <4 x i16>, ptr addrspace(5) %a 3113 %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1> 3114 store volatile <4 x i16> %a.add, ptr addrspace(5) %a 3115 ret void 3116} 3117 3118define void @local_volatile_8xi16(ptr addrspace(5) %a) { 3119; CHECK-LABEL: local_volatile_8xi16( 3120; CHECK: { 3121; CHECK-NEXT: .reg .b16 %rs<17>; 3122; CHECK-NEXT: .reg .b32 %r<9>; 3123; CHECK-NEXT: .reg .b64 %rd<2>; 3124; CHECK-EMPTY: 3125; CHECK-NEXT: // %bb.0: 3126; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0]; 3127; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; 
3128; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; 3129; CHECK-NEXT: add.s16 %rs3, %rs2, 1; 3130; CHECK-NEXT: add.s16 %rs4, %rs1, 1; 3131; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; 3132; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; 3133; CHECK-NEXT: add.s16 %rs7, %rs6, 1; 3134; CHECK-NEXT: add.s16 %rs8, %rs5, 1; 3135; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; 3136; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; 3137; CHECK-NEXT: add.s16 %rs11, %rs10, 1; 3138; CHECK-NEXT: add.s16 %rs12, %rs9, 1; 3139; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; 3140; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; 3141; CHECK-NEXT: add.s16 %rs15, %rs14, 1; 3142; CHECK-NEXT: add.s16 %rs16, %rs13, 1; 3143; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; 3144; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; 3145; CHECK-NEXT: ret; 3146 %a.load = load volatile <8 x i16>, ptr addrspace(5) %a 3147 %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1> 3148 store volatile <8 x i16> %a.add, ptr addrspace(5) %a 3149 ret void 3150} 3151 3152define void @local_volatile_2xi32(ptr addrspace(5) %a) { 3153; CHECK-LABEL: local_volatile_2xi32( 3154; CHECK: { 3155; CHECK-NEXT: .reg .b32 %r<5>; 3156; CHECK-NEXT: .reg .b64 %rd<2>; 3157; CHECK-EMPTY: 3158; CHECK-NEXT: // %bb.0: 3159; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; 3160; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; 3161; CHECK-NEXT: add.s32 %r3, %r2, 1; 3162; CHECK-NEXT: add.s32 %r4, %r1, 1; 3163; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; 3164; CHECK-NEXT: ret; 3165 %a.load = load volatile <2 x i32>, ptr addrspace(5) %a 3166 %a.add = add <2 x i32> %a.load, <i32 1, i32 1> 3167 store volatile <2 x i32> %a.add, ptr addrspace(5) %a 3168 ret void 3169} 3170 3171define void @local_volatile_4xi32(ptr addrspace(5) %a) { 3172; CHECK-LABEL: local_volatile_4xi32( 3173; CHECK: { 3174; CHECK-NEXT: .reg .b32 %r<9>; 3175; CHECK-NEXT: .reg .b64 %rd<2>; 3176; CHECK-EMPTY: 3177; CHECK-NEXT: // %bb.0: 3178; CHECK-NEXT: ld.param.u64 
%rd1, [local_volatile_4xi32_param_0]; 3179; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; 3180; CHECK-NEXT: add.s32 %r5, %r4, 1; 3181; CHECK-NEXT: add.s32 %r6, %r3, 1; 3182; CHECK-NEXT: add.s32 %r7, %r2, 1; 3183; CHECK-NEXT: add.s32 %r8, %r1, 1; 3184; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; 3185; CHECK-NEXT: ret; 3186 %a.load = load volatile <4 x i32>, ptr addrspace(5) %a 3187 %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1> 3188 store volatile <4 x i32> %a.add, ptr addrspace(5) %a 3189 ret void 3190} 3191 3192define void @local_volatile_2xi64(ptr addrspace(5) %a) { 3193; CHECK-LABEL: local_volatile_2xi64( 3194; CHECK: { 3195; CHECK-NEXT: .reg .b64 %rd<6>; 3196; CHECK-EMPTY: 3197; CHECK-NEXT: // %bb.0: 3198; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; 3199; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; 3200; CHECK-NEXT: add.s64 %rd4, %rd3, 1; 3201; CHECK-NEXT: add.s64 %rd5, %rd2, 1; 3202; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; 3203; CHECK-NEXT: ret; 3204 %a.load = load volatile <2 x i64>, ptr addrspace(5) %a 3205 %a.add = add <2 x i64> %a.load, <i64 1, i64 1> 3206 store volatile <2 x i64> %a.add, ptr addrspace(5) %a 3207 ret void 3208} 3209 3210define void @local_volatile_2xfloat(ptr addrspace(5) %a) { 3211; CHECK-LABEL: local_volatile_2xfloat( 3212; CHECK: { 3213; CHECK-NEXT: .reg .f32 %f<5>; 3214; CHECK-NEXT: .reg .b64 %rd<2>; 3215; CHECK-EMPTY: 3216; CHECK-NEXT: // %bb.0: 3217; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; 3218; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; 3219; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; 3220; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; 3221; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; 3222; CHECK-NEXT: ret; 3223 %a.load = load volatile <2 x float>, ptr addrspace(5) %a 3224 %a.add = fadd <2 x float> %a.load, <float 1., float 1.> 3225 store volatile <2 x float> %a.add, ptr addrspace(5) %a 3226 ret void 3227} 3228 3229define 
void @local_volatile_4xfloat(ptr addrspace(5) %a) { 3230; CHECK-LABEL: local_volatile_4xfloat( 3231; CHECK: { 3232; CHECK-NEXT: .reg .f32 %f<9>; 3233; CHECK-NEXT: .reg .b64 %rd<2>; 3234; CHECK-EMPTY: 3235; CHECK-NEXT: // %bb.0: 3236; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; 3237; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; 3238; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; 3239; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; 3240; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; 3241; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; 3242; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; 3243; CHECK-NEXT: ret; 3244 %a.load = load volatile <4 x float>, ptr addrspace(5) %a 3245 %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.> 3246 store volatile <4 x float> %a.add, ptr addrspace(5) %a 3247 ret void 3248} 3249 3250define void @local_volatile_2xdouble(ptr addrspace(5) %a) { 3251; CHECK-LABEL: local_volatile_2xdouble( 3252; CHECK: { 3253; CHECK-NEXT: .reg .b64 %rd<2>; 3254; CHECK-NEXT: .reg .f64 %fd<5>; 3255; CHECK-EMPTY: 3256; CHECK-NEXT: // %bb.0: 3257; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0]; 3258; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; 3259; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; 3260; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; 3261; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3}; 3262; CHECK-NEXT: ret; 3263 %a.load = load volatile <2 x double>, ptr addrspace(5) %a 3264 %a.add = fadd <2 x double> %a.load, <double 1., double 1.> 3265 store volatile <2 x double> %a.add, ptr addrspace(5) %a 3266 ret void 3267} 3268