; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s

; Code generation tests for the IR 'lshr' (logical shift right) instruction
; through GlobalISel, compiled for five AMDGPU processors (see the commands
; above).
;
; Naming convention used by the functions below:
;   v_*  - default calling convention; the expected code keeps the operands
;          in vector registers (v0, v1, ...).
;   s_*  - amdgpu_ps with 'inreg' arguments; the expected code keeps the
;          operands in scalar registers (s0, s1, ...).
;   *_sv - scalar (inreg) value shifted by a vector amount.
;   *_vs - vector value shifted by a scalar (inreg) amount.
;   *_N  - the shift amount is the constant N.
;
; The expected-output comment lines inside each function are regenerated by
; the script named on the first line; do not edit them by hand.

; --- i8 ---

define i8 @v_lshr_i8(i8 %value, i8 %amount) {
; GFX6-LABEL: v_lshr_i8:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_i8:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_i8:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xff, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i8 %value, %amount
  ret i8 %result
}

define i8 @v_lshr_i8_7(i8 %value) {
; GFX6-LABEL: v_lshr_i8_7:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_u32 v0, v0, 7, 1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_i8_7:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v1, 7
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_i8_7:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_mov_b32_e32 v1, 7
; GFX9-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i8_7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xff, v0
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 7, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i8 %value, 7
  ret i8 %result
}

define amdgpu_ps i8 @s_lshr_i8(i8 inreg %value, i8 inreg %amount) {
; GCN-LABEL: s_lshr_i8:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_and_b32 s0, s0, 0xff
; GCN-NEXT:    s_lshr_b32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i8:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xff
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i8 %value, %amount
  ret i8 %result
}

define amdgpu_ps i8 @s_lshr_i8_7(i8 inreg %value) {
; GCN-LABEL: s_lshr_i8_7:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bfe_u32 s0, s0, 0x10007
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i8_7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x10007
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i8 %value, 7
  ret i8 %result
}


; --- i24 ---

define i24 @v_lshr_i24(i24 %value, i24 %amount) {
; GCN-LABEL: v_lshr_i24:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; GCN-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i24:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i24 %value, %amount
  ret i24 %result
}

define i24 @v_lshr_i24_7(i24 %value) {
; GCN-LABEL: v_lshr_i24_7:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_bfe_u32 v0, v0, 7, 17
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i24_7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_bfe_u32 v0, v0, 7, 17
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i24 %value, 7
  ret i24 %result
}

define amdgpu_ps i24 @s_lshr_i24(i24 inreg %value, i24 inreg %amount) {
; GCN-LABEL: s_lshr_i24:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_and_b32 s0, s0, 0xffffff
; GCN-NEXT:    s_lshr_b32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i24:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffffff
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i24 %value, %amount
  ret i24 %result
}

define amdgpu_ps i24 @s_lshr_i24_7(i24 inreg %value) {
; GCN-LABEL: s_lshr_i24_7:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bfe_u32 s0, s0, 0x110007
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i24_7:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x110007
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i24 %value, 7
  ret i24 %result
}

; --- i32 and vectors of i32 ---

define i32 @v_lshr_i32(i32 %value, i32 %amount) {
; GCN-LABEL: v_lshr_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i32 %value, %amount
  ret i32 %result
}

define i32 @v_lshr_i32_31(i32 %value) {
; GCN-LABEL: v_lshr_i32_31:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i32_31:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i32 %value, 31
  ret i32 %result
}

define amdgpu_ps i32 @s_lshr_i32(i32 inreg %value, i32 inreg %amount) {
; GCN-LABEL: s_lshr_i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i32 %value, %amount
  ret i32 %result
}

define amdgpu_ps i32 @s_lshr_i32_31(i32 inreg %value) {
; GCN-LABEL: s_lshr_i32_31:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, 31
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i32_31:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, 31
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i32 %value, 31
  ret i32 %result
}

define amdgpu_ps float @lshr_i32_sv(i32 inreg %value, i32 %amount) {
; GFX6-LABEL: lshr_i32_sv:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: lshr_i32_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: lshr_i32_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_i32_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_lshrrev_b32_e64 v0, v0, s0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i32 %value, %amount
  %cast = bitcast i32 %result to float
  ret float %cast
}

define amdgpu_ps float @lshr_i32_vs(i32 %value, i32 inreg %amount) {
; GCN-LABEL: lshr_i32_vs:
; GCN:       ; %bb.0:
; GCN-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_i32_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i32 %value, %amount
  %cast = bitcast i32 %result to float
  ret float %cast
}

define <2 x i32> @v_lshr_v2i32(<2 x i32> %value, <2 x i32> %amount) {
; GCN-LABEL: v_lshr_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v3, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <2 x i32> %value, %amount
  ret <2 x i32> %result
}

define <2 x i32> @v_lshr_v2i32_31(<2 x i32> %value) {
; GCN-LABEL: v_lshr_v2i32_31:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v2i32_31:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, 31, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, 31, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <2 x i32> %value, <i32 31, i32 31>
  ret <2 x i32> %result
}

define amdgpu_ps <2 x i32> @s_lshr_v2i32(<2 x i32> inreg %value, <2 x i32> inreg %amount) {
; GCN-LABEL: s_lshr_v2i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s2
; GCN-NEXT:    s_lshr_b32 s1, s1, s3
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v2i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s2
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <2 x i32> %value, %amount
  ret <2 x i32> %result
}

define <3 x i32> @v_lshr_v3i32(<3 x i32> %value, <3 x i32> %amount) {
; GCN-LABEL: v_lshr_v3i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v3i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v3, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v5, v2
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <3 x i32> %value, %amount
  ret <3 x i32> %result
}

define amdgpu_ps <3 x i32> @s_lshr_v3i32(<3 x i32> inreg %value, <3 x i32> inreg %amount) {
; GCN-LABEL: s_lshr_v3i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s3
; GCN-NEXT:    s_lshr_b32 s1, s1, s4
; GCN-NEXT:    s_lshr_b32 s2, s2, s5
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v3i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s3
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s4
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s5
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <3 x i32> %value, %amount
  ret <3 x i32> %result
}

define <4 x i32> @v_lshr_v4i32(<4 x i32> %value, <4 x i32> %amount) {
; GCN-LABEL: v_lshr_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
; GCN-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v4i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v5, v1
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <4 x i32> %value, %amount
  ret <4 x i32> %result
}

define amdgpu_ps <4 x i32> @s_lshr_v4i32(<4 x i32> inreg %value, <4 x i32> inreg %amount) {
; GCN-LABEL: s_lshr_v4i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s4
; GCN-NEXT:    s_lshr_b32 s1, s1, s5
; GCN-NEXT:    s_lshr_b32 s2, s2, s6
; GCN-NEXT:    s_lshr_b32 s3, s3, s7
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v4i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s4
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s5
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s6
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s7
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <4 x i32> %value, %amount
  ret <4 x i32> %result
}

define <5 x i32> @v_lshr_v5i32(<5 x i32> %value, <5 x i32> %amount) {
; GCN-LABEL: v_lshr_v5i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
; GCN-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
; GCN-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
; GCN-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v5i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v0, v5, v0
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v3, v8, v3
; GFX10PLUS-NEXT:    v_lshrrev_b32_e32 v4, v9, v4
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <5 x i32> %value, %amount
  ret <5 x i32> %result
}

define amdgpu_ps <5 x i32> @s_lshr_v5i32(<5 x i32> inreg %value, <5 x i32> inreg %amount) {
; GCN-LABEL: s_lshr_v5i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s5
; GCN-NEXT:    s_lshr_b32 s1, s1, s6
; GCN-NEXT:    s_lshr_b32 s2, s2, s7
; GCN-NEXT:    s_lshr_b32 s3, s3, s8
; GCN-NEXT:    s_lshr_b32 s4, s4, s9
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v5i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s5
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s6
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s7
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s8
; GFX10PLUS-NEXT:    s_lshr_b32 s4, s4, s9
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <5 x i32> %value, %amount
  ret <5 x i32> %result
}

; In the expected output below, the last element of %amount is loaded from
; memory (addressed through s32) instead of arriving in a register.
define <16 x i32> @v_lshr_v16i32(<16 x i32> %value, <16 x i32> %amount) {
; GCN-LABEL: v_lshr_v16i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
; GCN-NEXT:    buffer_load_dword v16, off, s[0:3], s32
; GCN-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
; GCN-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
; GCN-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
; GCN-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
; GCN-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
; GCN-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
; GCN-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
; GCN-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
; GCN-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
; GCN-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
; GCN-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
; GCN-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
; GCN-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
; GCN-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_lshrrev_b32_e32 v15, v16, v15
; GCN-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_lshr_v16i32:
; GFX10:       ; %bb.0:
; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT:    buffer_load_dword v31, off, s[0:3], s32
; GFX10-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
; GFX10-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
; GFX10-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
; GFX10-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
; GFX10-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
; GFX10-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
; GFX10-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
; GFX10-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
; GFX10-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
; GFX10-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
; GFX10-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
; GFX10-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
; GFX10-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
; GFX10-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
; GFX10-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
; GFX10-NEXT:    s_waitcnt vmcnt(0)
; GFX10-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
; GFX10-NEXT:    s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_lshr_v16i32:
; GFX11:       ; %bb.0:
; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT:    scratch_load_b32 v31, off, s32
; GFX11-NEXT:    v_lshrrev_b32_e32 v0, v16, v0
; GFX11-NEXT:    v_lshrrev_b32_e32 v1, v17, v1
; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v18, v2
; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v19, v3
; GFX11-NEXT:    v_lshrrev_b32_e32 v4, v20, v4
; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v21, v5
; GFX11-NEXT:    v_lshrrev_b32_e32 v6, v22, v6
; GFX11-NEXT:    v_lshrrev_b32_e32 v7, v23, v7
; GFX11-NEXT:    v_lshrrev_b32_e32 v8, v24, v8
; GFX11-NEXT:    v_lshrrev_b32_e32 v9, v25, v9
; GFX11-NEXT:    v_lshrrev_b32_e32 v10, v26, v10
; GFX11-NEXT:    v_lshrrev_b32_e32 v11, v27, v11
; GFX11-NEXT:    v_lshrrev_b32_e32 v12, v28, v12
; GFX11-NEXT:    v_lshrrev_b32_e32 v13, v29, v13
; GFX11-NEXT:    v_lshrrev_b32_e32 v14, v30, v14
; GFX11-NEXT:    s_waitcnt vmcnt(0)
; GFX11-NEXT:    v_lshrrev_b32_e32 v15, v31, v15
; GFX11-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <16 x i32> %value, %amount
  ret <16 x i32> %result
}

define amdgpu_ps <16 x i32> @s_lshr_v16i32(<16 x i32> inreg %value, <16 x i32> inreg %amount) {
; GCN-LABEL: s_lshr_v16i32:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_lshr_b32 s0, s0, s16
; GCN-NEXT:    s_lshr_b32 s1, s1, s17
; GCN-NEXT:    s_lshr_b32 s2, s2, s18
; GCN-NEXT:    s_lshr_b32 s3, s3, s19
; GCN-NEXT:    s_lshr_b32 s4, s4, s20
; GCN-NEXT:    s_lshr_b32 s5, s5, s21
; GCN-NEXT:    s_lshr_b32 s6, s6, s22
; GCN-NEXT:    s_lshr_b32 s7, s7, s23
; GCN-NEXT:    s_lshr_b32 s8, s8, s24
; GCN-NEXT:    s_lshr_b32 s9, s9, s25
; GCN-NEXT:    s_lshr_b32 s10, s10, s26
; GCN-NEXT:    s_lshr_b32 s11, s11, s27
; GCN-NEXT:    s_lshr_b32 s12, s12, s28
; GCN-NEXT:    s_lshr_b32 s13, s13, s29
; GCN-NEXT:    s_lshr_b32 s14, s14, s30
; GCN-NEXT:    s_lshr_b32 s15, s15, s31
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v16i32:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s16
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s17
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s2, s18
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s3, s19
; GFX10PLUS-NEXT:    s_lshr_b32 s4, s4, s20
; GFX10PLUS-NEXT:    s_lshr_b32 s5, s5, s21
; GFX10PLUS-NEXT:    s_lshr_b32 s6, s6, s22
; GFX10PLUS-NEXT:    s_lshr_b32 s7, s7, s23
; GFX10PLUS-NEXT:    s_lshr_b32 s8, s8, s24
; GFX10PLUS-NEXT:    s_lshr_b32 s9, s9, s25
; GFX10PLUS-NEXT:    s_lshr_b32 s10, s10, s26
; GFX10PLUS-NEXT:    s_lshr_b32 s11, s11, s27
; GFX10PLUS-NEXT:    s_lshr_b32 s12, s12, s28
; GFX10PLUS-NEXT:    s_lshr_b32 s13, s13, s29
; GFX10PLUS-NEXT:    s_lshr_b32 s14, s14, s30
; GFX10PLUS-NEXT:    s_lshr_b32 s15, s15, s31
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <16 x i32> %value, %amount
  ret <16 x i32> %result
}

; --- i16 and vectors of i16 ---

define i16 @v_lshr_i16(i16 %value, i16 %amount) {
; GFX6-LABEL: v_lshr_i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v1, v0
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v1, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i16 %value, %amount
  ret i16 %result
}

define i16 @v_lshr_i16_15(i16 %value) {
; GFX6-LABEL: v_lshr_i16_15:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_i16_15:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_i16_15:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, 15, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_i16_15:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, 15, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr i16 %value, 15
  ret i16 %result
}

define amdgpu_ps i16 @s_lshr_i16(i16 inreg %value, i16 inreg %amount) {
; GCN-LABEL: s_lshr_i16:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_and_b32 s0, s0, 0xffff
; GCN-NEXT:    s_lshr_b32 s0, s0, s1
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i16 %value, %amount
  ret i16 %result
}

define amdgpu_ps i16 @s_lshr_i16_15(i16 inreg %value) {
; GCN-LABEL: s_lshr_i16_15:
; GCN:       ; %bb.0:
; GCN-NEXT:    s_bfe_u32 s0, s0, 0x1000f
; GCN-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_i16_15:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_bfe_u32 s0, s0, 0x1000f
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i16 %value, 15
  ret i16 %result
}

define amdgpu_ps half @lshr_i16_sv(i16 inreg %value, i16 %amount) {
; GFX6-LABEL: lshr_i16_sv:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: lshr_i16_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: lshr_i16_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshrrev_b16_e64 v0, v0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_i16_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, v0, s0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i16 %value, %amount
  %cast = bitcast i16 %result to half
  ret half %cast
}

define amdgpu_ps half @lshr_i16_vs(i16 %value, i16 inreg %amount) {
; GFX6-LABEL: lshr_i16_vs:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: lshr_i16_vs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    v_lshrrev_b16_e32 v0, s0, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: lshr_i16_vs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_lshrrev_b16_e32 v0, s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_i16_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_lshrrev_b16 v0, s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr i16 %value, %amount
  %cast = bitcast i16 %result to half
  ret half %cast
}

define <2 x i16> @v_lshr_v2i16(<2 x i16> %value, <2 x i16> %amount) {
; GFX6-LABEL: v_lshr_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v2, v0
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v1, v0
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v1, v0
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <2 x i16> %value, %amount
  ret <2 x i16> %result
}

define <2 x i16> @v_lshr_v2i16_15(<2 x i16> %value) {
; GFX6-LABEL: v_lshr_v2i16_15:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_bfe_u32 v0, v0, 15, 1
; GFX6-NEXT:    v_bfe_u32 v1, v1, 15, 1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v2i16_15:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_mov_b32_e32 v2, 15
; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 15, v0
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_v2i16_15:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v2i16_15:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, 15, v0 op_sel_hi:[0,1]
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <2 x i16> %value, <i16 15, i16 15>
  ret <2 x i16> %result
}

define amdgpu_ps i32 @s_lshr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v2i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    s_lshr_b32 s1, s1, s3
; GFX6-NEXT:    s_lshr_b32 s0, s0, s2
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_lshr_v2i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s2, s0, 16
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_lshr_b32 s3, s1, 16
; GFX8-NEXT:    s_lshr_b32 s0, s0, s1
; GFX8-NEXT:    s_lshr_b32 s1, s2, s3
; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_or_b32 s0, s1, s0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v2i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s2, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshr_b32 s3, s1, 16
; GFX9-NEXT:    s_lshr_b32 s0, s0, s1
; GFX9-NEXT:    s_lshr_b32 s1, s2, s3
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v2i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s0, 16
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s1, 16
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s1
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s2, s3
; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <2 x i16> %value, %amount
  %cast = bitcast <2 x i16> %result to i32
  ret i32 %cast
}

define amdgpu_ps float @lshr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
; GFX6-LABEL: lshr_v2i16_sv:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    v_lshr_b32_e32 v0, s0, v0
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
; GFX6-NEXT:    v_lshr_b32_e32 v1, s0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: lshr_v2i16_sv:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_lshrrev_b16_e64 v1, v0, s0
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: lshr_v2i16_sv:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_v2i16_sv:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v0, s0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <2 x i16> %value, %amount
  %cast = bitcast <2 x i16> %result to float
  ret float %cast
}

define amdgpu_ps float @lshr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount) {
; GFX6-LABEL: lshr_v2i16_vs:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, s0, v0
; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, s0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: lshr_v2i16_vs:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s1, s0, 16
; GFX8-NEXT:    v_mov_b32_e32 v2, s1
; GFX8-NEXT:    v_lshrrev_b16_e32 v1, s0, v0
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: lshr_v2i16_vs:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, s0, v0
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: lshr_v2i16_vs:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, s0, v0
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <2 x i16> %value, %amount
  %cast = bitcast <2 x i16> %result to float
  ret float %cast
}

; FIXME
; The <3 x i16> cases below are disabled. The reason is not recorded in this
; file (presumably the type is not handled yet; confirm before enabling).
; define <3 x i16> @v_lshr_v3i16(<3 x i16> %value, <3 x i16> %amount) {
;   %result = lshr <3 x i16> %value, %amount
;   ret <3 x i16> %result
; }

; define amdgpu_ps <3 x i16> @s_lshr_v3i16(<3 x i16> inreg %value, <3 x i16> inreg %amount) {
;   %result = lshr <3 x i16> %value, %amount
;   ret <3 x i16> %result
; }

define <2 x float> @v_lshr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
; GFX6-LABEL: v_lshr_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
; GFX6-NEXT:    v_lshrrev_b32_e32 v0, v4, v0
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v5
; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v4, v1
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v2
; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
; GFX6-NEXT:    v_lshrrev_b32_e32 v3, v4, v3
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX6-NEXT:    s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_lshr_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v2, v0
; GFX8-NEXT:    v_lshrrev_b16_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v3, v1
; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; GFX8-NEXT:    v_or_b32_e32 v0, v4, v0
; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
; GFX8-NEXT:    s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_lshr_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX9-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX9-NEXT:    s_setpc_b64 s[30:31]
;
; GFX10PLUS-LABEL: v_lshr_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v0, v2, v0
; GFX10PLUS-NEXT:    v_pk_lshrrev_b16 v1, v3, v1
; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
  %result = lshr <4 x i16> %value, %amount
  %cast = bitcast <4 x i16> %result to <2 x float>
  ret <2 x float> %cast
}

define amdgpu_ps <2 x i32> @s_lshr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
; GFX6-LABEL: s_lshr_v4i16:
; GFX6:       ; %bb.0:
; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX6-NEXT:    s_lshr_b32 s1, s1, s5
; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
; GFX6-NEXT:    s_lshr_b32 s0, s0, s4
; GFX6-NEXT:    s_and_b32 s2, s2, 0xffff
; GFX6-NEXT:    s_lshr_b32 s3, s3, s7
; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
; GFX6-NEXT:    s_lshr_b32 s2, s2, s6
; GFX6-NEXT:    s_or_b32 s0, s0, s1
; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
; GFX6-NEXT:    s_or_b32 s1, s2, s1
; GFX6-NEXT:    ; return to shader part epilog
;
; GFX8-LABEL: s_lshr_v4i16:
; GFX8:       ; %bb.0:
; GFX8-NEXT:    s_lshr_b32 s4, s0, 16
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_lshr_b32 s6, s2, 16
; GFX8-NEXT:    s_lshr_b32 s5, s1, 16
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_lshr_b32 s7, s3, 16
; GFX8-NEXT:    s_lshr_b32 s0, s0, s2
; GFX8-NEXT:    s_lshr_b32 s2, s4, s6
; GFX8-NEXT:    s_lshr_b32 s1, s1, s3
; GFX8-NEXT:    s_lshr_b32 s3, s5, s7
; GFX8-NEXT:    s_lshl_b32 s2, s2, 16
; GFX8-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX8-NEXT:    s_or_b32 s0, s2, s0
; GFX8-NEXT:    s_lshl_b32 s2, s3, 16
; GFX8-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX8-NEXT:    s_or_b32 s1, s2, s1
; GFX8-NEXT:    ; return to shader part epilog
;
; GFX9-LABEL: s_lshr_v4i16:
; GFX9:       ; %bb.0:
; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
; GFX9-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX9-NEXT:    s_lshr_b32 s5, s2, 16
; GFX9-NEXT:    s_lshr_b32 s0, s0, s2
; GFX9-NEXT:    s_lshr_b32 s2, s4, s5
; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX9-NEXT:    s_lshr_b32 s2, s1, 16
; GFX9-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
; GFX9-NEXT:    s_lshr_b32 s1, s1, s3
; GFX9-NEXT:    s_lshr_b32 s2, s2, s4
; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s2
; GFX9-NEXT:    ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_lshr_v4i16:
; GFX10PLUS:       ; %bb.0:
; GFX10PLUS-NEXT:    s_lshr_b32 s4, s0, 16
; GFX10PLUS-NEXT:    s_and_b32 s0, s0, 0xffff
; GFX10PLUS-NEXT:    s_lshr_b32 s5, s2, 16
; GFX10PLUS-NEXT:    s_lshr_b32 s0, s0, s2
; GFX10PLUS-NEXT:    s_lshr_b32 s2, s4, s5
; GFX10PLUS-NEXT:    s_lshr_b32 s4, s1, 16
; GFX10PLUS-NEXT:    s_and_b32 s1, s1, 0xffff
; GFX10PLUS-NEXT:    s_lshr_b32 s5, s3, 16
; GFX10PLUS-NEXT:    s_lshr_b32 s1, s1, s3
; GFX10PLUS-NEXT:    s_lshr_b32 s3, s4, s5
; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
; GFX10PLUS-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
; GFX10PLUS-NEXT:    ; return to shader part epilog
  %result = lshr <4 x i16> %value, %amount
  %cast = bitcast <4 x i16> %result to <2 x i32>
  ret <2 x i32> %cast
}

; FIXME
; define <5 x i16> @v_lshr_v5i16(<5 x i16> %value, <5 x i16> %amount) {
;   %result = lshr <5 x i16> %value, %amount
;   ret <5 x i16> %result
; }

; define amdgpu_ps <5 x i16> @s_lshr_v5i16(<5 x i16> inreg %value, <5 x i16> inreg %amount) {
;   %result = lshr <5 x i16> %value, %amount
;   ret <5 x i16> %result
; }

; define <3 x float> @v_lshr_v6i16(<6 x i16> %value, <6 x i16> %amount) {
;   %result = lshr <6 x i16> %value, %amount
;   %cast = bitcast <6 x i16> %result to <3 x float>
;   ret <3 x float> %cast
; }

; define amdgpu_ps <3 x i32> @s_lshr_v6i16(<6 x i16> inreg %value, <6 x i16> inreg %amount) {
;   %result = lshr <6 x i16> %value, %amount
;   %cast = bitcast
<6 x i16> %result to <3 x i32> 1046; ret <3 x i32> %cast 1047; } 1048 1049define <4 x float> @v_lshr_v8i16(<8 x i16> %value, <8 x i16> %amount) { 1050; GFX6-LABEL: v_lshr_v8i16: 1051; GFX6: ; %bb.0: 1052; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1053; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v8 1054; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 1055; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v0 1056; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v9 1057; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 1058; GFX6-NEXT: v_lshrrev_b32_e32 v1, v8, v1 1059; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v10 1060; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 1061; GFX6-NEXT: v_lshrrev_b32_e32 v2, v8, v2 1062; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v11 1063; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 1064; GFX6-NEXT: v_lshrrev_b32_e32 v3, v8, v3 1065; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v12 1066; GFX6-NEXT: v_and_b32_e32 v4, 0xffff, v4 1067; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v4 1068; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v13 1069; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v5 1070; GFX6-NEXT: v_lshrrev_b32_e32 v5, v8, v5 1071; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v14 1072; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v6 1073; GFX6-NEXT: v_lshrrev_b32_e32 v6, v8, v6 1074; GFX6-NEXT: v_and_b32_e32 v8, 0xffff, v15 1075; GFX6-NEXT: v_and_b32_e32 v7, 0xffff, v7 1076; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1077; GFX6-NEXT: v_lshrrev_b32_e32 v7, v8, v7 1078; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 1079; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 1080; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 1081; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 1082; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7 1083; GFX6-NEXT: v_or_b32_e32 v2, v4, v2 1084; GFX6-NEXT: v_or_b32_e32 v3, v6, v3 1085; GFX6-NEXT: s_setpc_b64 s[30:31] 1086; 1087; GFX8-LABEL: v_lshr_v8i16: 1088; GFX8: ; %bb.0: 1089; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1090; GFX8-NEXT: v_lshrrev_b16_e32 v8, v4, v0 1091; GFX8-NEXT: v_lshrrev_b16_sdwa v0, v4, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD 
src0_sel:WORD_1 src1_sel:WORD_1 1092; GFX8-NEXT: v_lshrrev_b16_e32 v4, v5, v1 1093; GFX8-NEXT: v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1094; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 1095; GFX8-NEXT: v_lshrrev_b16_e32 v4, v6, v2 1096; GFX8-NEXT: v_lshrrev_b16_sdwa v2, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1097; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 1098; GFX8-NEXT: v_lshrrev_b16_e32 v4, v7, v3 1099; GFX8-NEXT: v_lshrrev_b16_sdwa v3, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1100; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 1101; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 1102; GFX8-NEXT: s_setpc_b64 s[30:31] 1103; 1104; GFX9-LABEL: v_lshr_v8i16: 1105; GFX9: ; %bb.0: 1106; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1107; GFX9-NEXT: v_pk_lshrrev_b16 v0, v4, v0 1108; GFX9-NEXT: v_pk_lshrrev_b16 v1, v5, v1 1109; GFX9-NEXT: v_pk_lshrrev_b16 v2, v6, v2 1110; GFX9-NEXT: v_pk_lshrrev_b16 v3, v7, v3 1111; GFX9-NEXT: s_setpc_b64 s[30:31] 1112; 1113; GFX10PLUS-LABEL: v_lshr_v8i16: 1114; GFX10PLUS: ; %bb.0: 1115; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1116; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v0, v4, v0 1117; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v1, v5, v1 1118; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v2, v6, v2 1119; GFX10PLUS-NEXT: v_pk_lshrrev_b16 v3, v7, v3 1120; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1121 %result = lshr <8 x i16> %value, %amount 1122 %cast = bitcast <8 x i16> %result to <4 x float> 1123 ret <4 x float> %cast 1124} 1125 1126define amdgpu_ps <4 x i32> @s_lshr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) { 1127; GFX6-LABEL: s_lshr_v8i16: 1128; GFX6: ; %bb.0: 1129; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 1130; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 1131; GFX6-NEXT: s_lshr_b32 s1, s1, s9 1132; GFX6-NEXT: s_and_b32 s3, s3, 0xffff 1133; GFX6-NEXT: s_lshr_b32 s0, s0, s8 1134; GFX6-NEXT: s_and_b32 s2, s2, 0xffff 1135; GFX6-NEXT: s_lshr_b32 s3, s3, 
s11 1136; GFX6-NEXT: s_and_b32 s5, s5, 0xffff 1137; GFX6-NEXT: s_and_b32 s7, s7, 0xffff 1138; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1139; GFX6-NEXT: s_lshr_b32 s2, s2, s10 1140; GFX6-NEXT: s_and_b32 s4, s4, 0xffff 1141; GFX6-NEXT: s_lshr_b32 s5, s5, s13 1142; GFX6-NEXT: s_and_b32 s6, s6, 0xffff 1143; GFX6-NEXT: s_lshr_b32 s7, s7, s15 1144; GFX6-NEXT: s_or_b32 s0, s0, s1 1145; GFX6-NEXT: s_lshl_b32 s1, s3, 16 1146; GFX6-NEXT: s_lshr_b32 s4, s4, s12 1147; GFX6-NEXT: s_lshr_b32 s6, s6, s14 1148; GFX6-NEXT: s_or_b32 s1, s2, s1 1149; GFX6-NEXT: s_lshl_b32 s2, s5, 16 1150; GFX6-NEXT: s_lshl_b32 s3, s7, 16 1151; GFX6-NEXT: s_or_b32 s2, s4, s2 1152; GFX6-NEXT: s_or_b32 s3, s6, s3 1153; GFX6-NEXT: ; return to shader part epilog 1154; 1155; GFX8-LABEL: s_lshr_v8i16: 1156; GFX8: ; %bb.0: 1157; GFX8-NEXT: s_lshr_b32 s8, s0, 16 1158; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 1159; GFX8-NEXT: s_lshr_b32 s12, s4, 16 1160; GFX8-NEXT: s_lshr_b32 s9, s1, 16 1161; GFX8-NEXT: s_and_b32 s1, s1, 0xffff 1162; GFX8-NEXT: s_lshr_b32 s13, s5, 16 1163; GFX8-NEXT: s_lshr_b32 s0, s0, s4 1164; GFX8-NEXT: s_lshr_b32 s4, s8, s12 1165; GFX8-NEXT: s_lshr_b32 s10, s2, 16 1166; GFX8-NEXT: s_and_b32 s2, s2, 0xffff 1167; GFX8-NEXT: s_lshr_b32 s14, s6, 16 1168; GFX8-NEXT: s_lshr_b32 s1, s1, s5 1169; GFX8-NEXT: s_lshr_b32 s5, s9, s13 1170; GFX8-NEXT: s_lshl_b32 s4, s4, 16 1171; GFX8-NEXT: s_and_b32 s0, s0, 0xffff 1172; GFX8-NEXT: s_lshr_b32 s11, s3, 16 1173; GFX8-NEXT: s_and_b32 s3, s3, 0xffff 1174; GFX8-NEXT: s_lshr_b32 s15, s7, 16 1175; GFX8-NEXT: s_lshr_b32 s2, s2, s6 1176; GFX8-NEXT: s_lshr_b32 s6, s10, s14 1177; GFX8-NEXT: s_or_b32 s0, s4, s0 1178; GFX8-NEXT: s_lshl_b32 s4, s5, 16 1179; GFX8-NEXT: s_and_b32 s1, s1, 0xffff 1180; GFX8-NEXT: s_lshr_b32 s3, s3, s7 1181; GFX8-NEXT: s_lshr_b32 s7, s11, s15 1182; GFX8-NEXT: s_or_b32 s1, s4, s1 1183; GFX8-NEXT: s_lshl_b32 s4, s6, 16 1184; GFX8-NEXT: s_and_b32 s2, s2, 0xffff 1185; GFX8-NEXT: s_or_b32 s2, s4, s2 1186; GFX8-NEXT: s_lshl_b32 s4, s7, 16 1187; GFX8-NEXT: 
s_and_b32 s3, s3, 0xffff 1188; GFX8-NEXT: s_or_b32 s3, s4, s3 1189; GFX8-NEXT: ; return to shader part epilog 1190; 1191; GFX9-LABEL: s_lshr_v8i16: 1192; GFX9: ; %bb.0: 1193; GFX9-NEXT: s_lshr_b32 s8, s0, 16 1194; GFX9-NEXT: s_and_b32 s0, s0, 0xffff 1195; GFX9-NEXT: s_lshr_b32 s9, s4, 16 1196; GFX9-NEXT: s_lshr_b32 s0, s0, s4 1197; GFX9-NEXT: s_lshr_b32 s4, s8, s9 1198; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 1199; GFX9-NEXT: s_lshr_b32 s4, s1, 16 1200; GFX9-NEXT: s_and_b32 s1, s1, 0xffff 1201; GFX9-NEXT: s_lshr_b32 s8, s5, 16 1202; GFX9-NEXT: s_lshr_b32 s1, s1, s5 1203; GFX9-NEXT: s_lshr_b32 s4, s4, s8 1204; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 1205; GFX9-NEXT: s_lshr_b32 s4, s2, 16 1206; GFX9-NEXT: s_and_b32 s2, s2, 0xffff 1207; GFX9-NEXT: s_lshr_b32 s5, s6, 16 1208; GFX9-NEXT: s_lshr_b32 s2, s2, s6 1209; GFX9-NEXT: s_lshr_b32 s4, s4, s5 1210; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 1211; GFX9-NEXT: s_lshr_b32 s4, s3, 16 1212; GFX9-NEXT: s_and_b32 s3, s3, 0xffff 1213; GFX9-NEXT: s_lshr_b32 s5, s7, 16 1214; GFX9-NEXT: s_lshr_b32 s3, s3, s7 1215; GFX9-NEXT: s_lshr_b32 s4, s4, s5 1216; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 1217; GFX9-NEXT: ; return to shader part epilog 1218; 1219; GFX10PLUS-LABEL: s_lshr_v8i16: 1220; GFX10PLUS: ; %bb.0: 1221; GFX10PLUS-NEXT: s_lshr_b32 s8, s0, 16 1222; GFX10PLUS-NEXT: s_and_b32 s0, s0, 0xffff 1223; GFX10PLUS-NEXT: s_lshr_b32 s9, s4, 16 1224; GFX10PLUS-NEXT: s_lshr_b32 s0, s0, s4 1225; GFX10PLUS-NEXT: s_lshr_b32 s4, s8, s9 1226; GFX10PLUS-NEXT: s_lshr_b32 s8, s1, 16 1227; GFX10PLUS-NEXT: s_and_b32 s1, s1, 0xffff 1228; GFX10PLUS-NEXT: s_lshr_b32 s9, s5, 16 1229; GFX10PLUS-NEXT: s_lshr_b32 s1, s1, s5 1230; GFX10PLUS-NEXT: s_lshr_b32 s5, s8, s9 1231; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s0, s0, s4 1232; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s1, s1, s5 1233; GFX10PLUS-NEXT: s_lshr_b32 s4, s2, 16 1234; GFX10PLUS-NEXT: s_and_b32 s2, s2, 0xffff 1235; GFX10PLUS-NEXT: s_lshr_b32 s5, s6, 16 1236; GFX10PLUS-NEXT: s_lshr_b32 s2, s2, s6 1237; 
GFX10PLUS-NEXT: s_lshr_b32 s4, s4, s5 1238; GFX10PLUS-NEXT: s_lshr_b32 s5, s3, 16 1239; GFX10PLUS-NEXT: s_and_b32 s3, s3, 0xffff 1240; GFX10PLUS-NEXT: s_lshr_b32 s6, s7, 16 1241; GFX10PLUS-NEXT: s_lshr_b32 s3, s3, s7 1242; GFX10PLUS-NEXT: s_lshr_b32 s5, s5, s6 1243; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s2, s2, s4 1244; GFX10PLUS-NEXT: s_pack_ll_b32_b16 s3, s3, s5 1245; GFX10PLUS-NEXT: ; return to shader part epilog 1246 %result = lshr <8 x i16> %value, %amount 1247 %cast = bitcast <8 x i16> %result to <4 x i32> 1248 ret <4 x i32> %cast 1249} 1250 1251define i64 @v_lshr_i64(i64 %value, i64 %amount) { 1252; GFX6-LABEL: v_lshr_i64: 1253; GFX6: ; %bb.0: 1254; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1255; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], v2 1256; GFX6-NEXT: s_setpc_b64 s[30:31] 1257; 1258; GFX8-LABEL: v_lshr_i64: 1259; GFX8: ; %bb.0: 1260; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1261; GFX8-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] 1262; GFX8-NEXT: s_setpc_b64 s[30:31] 1263; 1264; GFX9-LABEL: v_lshr_i64: 1265; GFX9: ; %bb.0: 1266; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1267; GFX9-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] 1268; GFX9-NEXT: s_setpc_b64 s[30:31] 1269; 1270; GFX10PLUS-LABEL: v_lshr_i64: 1271; GFX10PLUS: ; %bb.0: 1272; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1273; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1] 1274; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1275 %result = lshr i64 %value, %amount 1276 ret i64 %result 1277} 1278 1279define i64 @v_lshr_i64_63(i64 %value) { 1280; GCN-LABEL: v_lshr_i64_63: 1281; GCN: ; %bb.0: 1282; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1283; GCN-NEXT: v_lshrrev_b32_e32 v0, 31, v1 1284; GCN-NEXT: v_mov_b32_e32 v1, 0 1285; GCN-NEXT: s_setpc_b64 s[30:31] 1286; 1287; GFX10PLUS-LABEL: v_lshr_i64_63: 1288; GFX10PLUS: ; %bb.0: 1289; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1290; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 31, v1 1291; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 
1292; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1293 %result = lshr i64 %value, 63 1294 ret i64 %result 1295} 1296 1297define i64 @v_lshr_i64_33(i64 %value) { 1298; GCN-LABEL: v_lshr_i64_33: 1299; GCN: ; %bb.0: 1300; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1301; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v1 1302; GCN-NEXT: v_mov_b32_e32 v1, 0 1303; GCN-NEXT: s_setpc_b64 s[30:31] 1304; 1305; GFX10PLUS-LABEL: v_lshr_i64_33: 1306; GFX10PLUS: ; %bb.0: 1307; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1308; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 1, v1 1309; GFX10PLUS-NEXT: v_mov_b32_e32 v1, 0 1310; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1311 %result = lshr i64 %value, 33 1312 ret i64 %result 1313} 1314 1315define i64 @v_lshr_i64_32(i64 %value) { 1316; GCN-LABEL: v_lshr_i64_32: 1317; GCN: ; %bb.0: 1318; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1319; GCN-NEXT: v_mov_b32_e32 v0, v1 1320; GCN-NEXT: v_mov_b32_e32 v1, 0 1321; GCN-NEXT: s_setpc_b64 s[30:31] 1322; 1323; GFX10-LABEL: v_lshr_i64_32: 1324; GFX10: ; %bb.0: 1325; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1326; GFX10-NEXT: v_mov_b32_e32 v0, v1 1327; GFX10-NEXT: v_mov_b32_e32 v1, 0 1328; GFX10-NEXT: s_setpc_b64 s[30:31] 1329; 1330; GFX11-LABEL: v_lshr_i64_32: 1331; GFX11: ; %bb.0: 1332; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1333; GFX11-NEXT: v_dual_mov_b32 v0, v1 :: v_dual_mov_b32 v1, 0 1334; GFX11-NEXT: s_setpc_b64 s[30:31] 1335 %result = lshr i64 %value, 32 1336 ret i64 %result 1337} 1338 1339define i64 @v_lshr_i64_31(i64 %value) { 1340; GFX6-LABEL: v_lshr_i64_31: 1341; GFX6: ; %bb.0: 1342; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1343; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 31 1344; GFX6-NEXT: s_setpc_b64 s[30:31] 1345; 1346; GFX8-LABEL: v_lshr_i64_31: 1347; GFX8: ; %bb.0: 1348; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1349; GFX8-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1350; GFX8-NEXT: s_setpc_b64 s[30:31] 1351; 1352; GFX9-LABEL: v_lshr_i64_31: 1353; GFX9: ; 
%bb.0: 1354; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1355; GFX9-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1356; GFX9-NEXT: s_setpc_b64 s[30:31] 1357; 1358; GFX10PLUS-LABEL: v_lshr_i64_31: 1359; GFX10PLUS: ; %bb.0: 1360; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1361; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1362; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1363 %result = lshr i64 %value, 31 1364 ret i64 %result 1365} 1366 1367define amdgpu_ps i64 @s_lshr_i64(i64 inreg %value, i64 inreg %amount) { 1368; GCN-LABEL: s_lshr_i64: 1369; GCN: ; %bb.0: 1370; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 1371; GCN-NEXT: ; return to shader part epilog 1372; 1373; GFX10PLUS-LABEL: s_lshr_i64: 1374; GFX10PLUS: ; %bb.0: 1375; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], s2 1376; GFX10PLUS-NEXT: ; return to shader part epilog 1377 %result = lshr i64 %value, %amount 1378 ret i64 %result 1379} 1380 1381define amdgpu_ps i64 @s_lshr_i64_63(i64 inreg %value) { 1382; GCN-LABEL: s_lshr_i64_63: 1383; GCN: ; %bb.0: 1384; GCN-NEXT: s_lshr_b32 s0, s1, 31 1385; GCN-NEXT: s_mov_b32 s1, 0 1386; GCN-NEXT: ; return to shader part epilog 1387; 1388; GFX10PLUS-LABEL: s_lshr_i64_63: 1389; GFX10PLUS: ; %bb.0: 1390; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 31 1391; GFX10PLUS-NEXT: s_mov_b32 s1, 0 1392; GFX10PLUS-NEXT: ; return to shader part epilog 1393 %result = lshr i64 %value, 63 1394 ret i64 %result 1395} 1396 1397define amdgpu_ps i64 @s_lshr_i64_33(i64 inreg %value) { 1398; GCN-LABEL: s_lshr_i64_33: 1399; GCN: ; %bb.0: 1400; GCN-NEXT: s_lshr_b32 s0, s1, 1 1401; GCN-NEXT: s_mov_b32 s1, 0 1402; GCN-NEXT: ; return to shader part epilog 1403; 1404; GFX10PLUS-LABEL: s_lshr_i64_33: 1405; GFX10PLUS: ; %bb.0: 1406; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 1407; GFX10PLUS-NEXT: s_mov_b32 s1, 0 1408; GFX10PLUS-NEXT: ; return to shader part epilog 1409 %result = lshr i64 %value, 33 1410 ret i64 %result 1411} 1412 1413define amdgpu_ps i64 @s_lshr_i64_32(i64 inreg %value) { 1414; GCN-LABEL: 
s_lshr_i64_32: 1415; GCN: ; %bb.0: 1416; GCN-NEXT: s_mov_b32 s0, s1 1417; GCN-NEXT: s_mov_b32 s1, 0 1418; GCN-NEXT: ; return to shader part epilog 1419; 1420; GFX10PLUS-LABEL: s_lshr_i64_32: 1421; GFX10PLUS: ; %bb.0: 1422; GFX10PLUS-NEXT: s_mov_b32 s0, s1 1423; GFX10PLUS-NEXT: s_mov_b32 s1, 0 1424; GFX10PLUS-NEXT: ; return to shader part epilog 1425 %result = lshr i64 %value, 32 1426 ret i64 %result 1427} 1428 1429define amdgpu_ps i64 @s_lshr_i64_31(i64 inreg %value) { 1430; GCN-LABEL: s_lshr_i64_31: 1431; GCN: ; %bb.0: 1432; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 1433; GCN-NEXT: ; return to shader part epilog 1434; 1435; GFX10PLUS-LABEL: s_lshr_i64_31: 1436; GFX10PLUS: ; %bb.0: 1437; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 31 1438; GFX10PLUS-NEXT: ; return to shader part epilog 1439 %result = lshr i64 %value, 31 1440 ret i64 %result 1441} 1442 1443define amdgpu_ps <2 x float> @lshr_i64_sv(i64 inreg %value, i64 %amount) { 1444; GFX6-LABEL: lshr_i64_sv: 1445; GFX6: ; %bb.0: 1446; GFX6-NEXT: v_lshr_b64 v[0:1], s[0:1], v0 1447; GFX6-NEXT: ; return to shader part epilog 1448; 1449; GFX8-LABEL: lshr_i64_sv: 1450; GFX8: ; %bb.0: 1451; GFX8-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] 1452; GFX8-NEXT: ; return to shader part epilog 1453; 1454; GFX9-LABEL: lshr_i64_sv: 1455; GFX9: ; %bb.0: 1456; GFX9-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] 1457; GFX9-NEXT: ; return to shader part epilog 1458; 1459; GFX10PLUS-LABEL: lshr_i64_sv: 1460; GFX10PLUS: ; %bb.0: 1461; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v0, s[0:1] 1462; GFX10PLUS-NEXT: ; return to shader part epilog 1463 %result = lshr i64 %value, %amount 1464 %cast = bitcast i64 %result to <2 x float> 1465 ret <2 x float> %cast 1466} 1467 1468define amdgpu_ps <2 x float> @lshr_i64_vs(i64 %value, i64 inreg %amount) { 1469; GFX6-LABEL: lshr_i64_vs: 1470; GFX6: ; %bb.0: 1471; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], s0 1472; GFX6-NEXT: ; return to shader part epilog 1473; 1474; GFX8-LABEL: lshr_i64_vs: 1475; GFX8: ; %bb.0: 1476; 
GFX8-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] 1477; GFX8-NEXT: ; return to shader part epilog 1478; 1479; GFX9-LABEL: lshr_i64_vs: 1480; GFX9: ; %bb.0: 1481; GFX9-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] 1482; GFX9-NEXT: ; return to shader part epilog 1483; 1484; GFX10PLUS-LABEL: lshr_i64_vs: 1485; GFX10PLUS: ; %bb.0: 1486; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], s0, v[0:1] 1487; GFX10PLUS-NEXT: ; return to shader part epilog 1488 %result = lshr i64 %value, %amount 1489 %cast = bitcast i64 %result to <2 x float> 1490 ret <2 x float> %cast 1491} 1492 1493define <2 x i64> @v_lshr_v2i64(<2 x i64> %value, <2 x i64> %amount) { 1494; GFX6-LABEL: v_lshr_v2i64: 1495; GFX6: ; %bb.0: 1496; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1497; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], v4 1498; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], v6 1499; GFX6-NEXT: s_setpc_b64 s[30:31] 1500; 1501; GFX8-LABEL: v_lshr_v2i64: 1502; GFX8: ; %bb.0: 1503; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1504; GFX8-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] 1505; GFX8-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] 1506; GFX8-NEXT: s_setpc_b64 s[30:31] 1507; 1508; GFX9-LABEL: v_lshr_v2i64: 1509; GFX9: ; %bb.0: 1510; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1511; GFX9-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] 1512; GFX9-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] 1513; GFX9-NEXT: s_setpc_b64 s[30:31] 1514; 1515; GFX10PLUS-LABEL: v_lshr_v2i64: 1516; GFX10PLUS: ; %bb.0: 1517; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1518; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], v4, v[0:1] 1519; GFX10PLUS-NEXT: v_lshrrev_b64 v[2:3], v6, v[2:3] 1520; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1521 %result = lshr <2 x i64> %value, %amount 1522 ret <2 x i64> %result 1523} 1524 1525define <2 x i64> @v_lshr_v2i64_31(<2 x i64> %value) { 1526; GFX6-LABEL: v_lshr_v2i64_31: 1527; GFX6: ; %bb.0: 1528; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1529; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 31 1530; GFX6-NEXT: v_lshr_b64 v[2:3], 
v[2:3], 31 1531; GFX6-NEXT: s_setpc_b64 s[30:31] 1532; 1533; GFX8-LABEL: v_lshr_v2i64_31: 1534; GFX8: ; %bb.0: 1535; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1536; GFX8-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1537; GFX8-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] 1538; GFX8-NEXT: s_setpc_b64 s[30:31] 1539; 1540; GFX9-LABEL: v_lshr_v2i64_31: 1541; GFX9: ; %bb.0: 1542; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1543; GFX9-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1544; GFX9-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] 1545; GFX9-NEXT: s_setpc_b64 s[30:31] 1546; 1547; GFX10PLUS-LABEL: v_lshr_v2i64_31: 1548; GFX10PLUS: ; %bb.0: 1549; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1550; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 31, v[0:1] 1551; GFX10PLUS-NEXT: v_lshrrev_b64 v[2:3], 31, v[2:3] 1552; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1553 %result = lshr <2 x i64> %value, <i64 31, i64 31> 1554 ret <2 x i64> %result 1555} 1556 1557define amdgpu_ps <2 x i64> @s_lshr_v2i64(<2 x i64> inreg %value, <2 x i64> inreg %amount) { 1558; GCN-LABEL: s_lshr_v2i64: 1559; GCN: ; %bb.0: 1560; GCN-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 1561; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 1562; GCN-NEXT: ; return to shader part epilog 1563; 1564; GFX10PLUS-LABEL: s_lshr_v2i64: 1565; GFX10PLUS: ; %bb.0: 1566; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], s4 1567; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[2:3], s6 1568; GFX10PLUS-NEXT: ; return to shader part epilog 1569 %result = lshr <2 x i64> %value, %amount 1570 ret <2 x i64> %result 1571} 1572 1573define i65 @v_lshr_i65(i65 %value, i65 %amount) { 1574; GFX6-LABEL: v_lshr_i65: 1575; GFX6: ; %bb.0: 1576; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1577; GFX6-NEXT: v_and_b32_e32 v4, 1, v2 1578; GFX6-NEXT: v_mov_b32_e32 v5, 0 1579; GFX6-NEXT: v_sub_i32_e32 v8, vcc, 64, v3 1580; GFX6-NEXT: v_add_i32_e32 v2, vcc, 0xffffffc0, v3 1581; GFX6-NEXT: v_lshr_b64 v[6:7], v[0:1], v3 1582; GFX6-NEXT: v_lshl_b64 v[8:9], v[4:5], v8 1583; GFX6-NEXT: 
v_lshr_b64 v[10:11], v[4:5], v3 1584; GFX6-NEXT: v_lshr_b64 v[4:5], v[4:5], v2 1585; GFX6-NEXT: v_or_b32_e32 v6, v6, v8 1586; GFX6-NEXT: v_or_b32_e32 v7, v7, v9 1587; GFX6-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 1588; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc 1589; GFX6-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc 1590; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 1591; GFX6-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] 1592; GFX6-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] 1593; GFX6-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc 1594; GFX6-NEXT: s_setpc_b64 s[30:31] 1595; 1596; GFX8-LABEL: v_lshr_i65: 1597; GFX8: ; %bb.0: 1598; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1599; GFX8-NEXT: v_and_b32_e32 v4, 1, v2 1600; GFX8-NEXT: v_mov_b32_e32 v5, 0 1601; GFX8-NEXT: v_sub_u32_e32 v8, vcc, 64, v3 1602; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0xffffffc0, v3 1603; GFX8-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] 1604; GFX8-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] 1605; GFX8-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] 1606; GFX8-NEXT: v_lshrrev_b64 v[4:5], v2, v[4:5] 1607; GFX8-NEXT: v_or_b32_e32 v6, v6, v8 1608; GFX8-NEXT: v_or_b32_e32 v7, v7, v9 1609; GFX8-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 1610; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc 1611; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc 1612; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 1613; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] 1614; GFX8-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] 1615; GFX8-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc 1616; GFX8-NEXT: s_setpc_b64 s[30:31] 1617; 1618; GFX9-LABEL: v_lshr_i65: 1619; GFX9: ; %bb.0: 1620; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1621; GFX9-NEXT: v_and_b32_e32 v4, 1, v2 1622; GFX9-NEXT: v_mov_b32_e32 v5, 0 1623; GFX9-NEXT: v_sub_u32_e32 v8, 64, v3 1624; GFX9-NEXT: v_add_u32_e32 v2, 0xffffffc0, v3 1625; GFX9-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] 1626; GFX9-NEXT: v_lshlrev_b64 v[8:9], v8, v[4:5] 1627; GFX9-NEXT: v_lshrrev_b64 v[10:11], v3, v[4:5] 1628; GFX9-NEXT: v_lshrrev_b64 v[4:5], 
v2, v[4:5] 1629; GFX9-NEXT: v_or_b32_e32 v6, v6, v8 1630; GFX9-NEXT: v_or_b32_e32 v7, v7, v9 1631; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3 1632; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc 1633; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc 1634; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3 1635; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v0, s[4:5] 1636; GFX9-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[4:5] 1637; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v10, vcc 1638; GFX9-NEXT: s_setpc_b64 s[30:31] 1639; 1640; GFX10-LABEL: v_lshr_i65: 1641; GFX10: ; %bb.0: 1642; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1643; GFX10-NEXT: v_mov_b32_e32 v5, 0 1644; GFX10-NEXT: v_and_b32_e32 v4, 1, v2 1645; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 1646; GFX10-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 1647; GFX10-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] 1648; GFX10-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 1649; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 0, v3 1650; GFX10-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] 1651; GFX10-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] 1652; GFX10-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] 1653; GFX10-NEXT: v_or_b32_e32 v2, v6, v8 1654; GFX10-NEXT: v_or_b32_e32 v6, v7, v9 1655; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo 1656; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo 1657; GFX10-NEXT: v_cndmask_b32_e64 v0, v2, v0, s4 1658; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, v1, s4 1659; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo 1660; GFX10-NEXT: s_setpc_b64 s[30:31] 1661; 1662; GFX11-LABEL: v_lshr_i65: 1663; GFX11: ; %bb.0: 1664; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1665; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v4, 1, v2 1666; GFX11-NEXT: v_sub_nc_u32_e32 v2, 64, v3 1667; GFX11-NEXT: v_lshrrev_b64 v[6:7], v3, v[0:1] 1668; GFX11-NEXT: v_cmp_gt_u32_e32 vcc_lo, 64, v3 1669; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0, v3 1670; GFX11-NEXT: v_lshlrev_b64 v[8:9], v2, v[4:5] 1671; GFX11-NEXT: v_or_b32_e32 v2, v6, v8 1672; GFX11-NEXT: v_or_b32_e32 v6, v7, v9 1673; 
GFX11-NEXT: v_add_nc_u32_e32 v10, 0xffffffc0, v3 1674; GFX11-NEXT: v_lshrrev_b64 v[10:11], v10, v[4:5] 1675; GFX11-NEXT: v_lshrrev_b64 v[4:5], v3, v[4:5] 1676; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v6, vcc_lo 1677; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo 1678; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, v1, s0 1679; GFX11-NEXT: v_cndmask_b32_e64 v0, v2, v0, s0 1680; GFX11-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc_lo 1681; GFX11-NEXT: s_setpc_b64 s[30:31] 1682 %result = lshr i65 %value, %amount 1683 ret i65 %result 1684} 1685 1686define i65 @v_lshr_i65_33(i65 %value) { 1687; GFX6-LABEL: v_lshr_i65_33: 1688; GFX6: ; %bb.0: 1689; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1690; GFX6-NEXT: v_mov_b32_e32 v3, v1 1691; GFX6-NEXT: v_and_b32_e32 v0, 1, v2 1692; GFX6-NEXT: v_mov_b32_e32 v1, 0 1693; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 31 1694; GFX6-NEXT: v_lshrrev_b32_e32 v2, 1, v3 1695; GFX6-NEXT: v_or_b32_e32 v0, v2, v0 1696; GFX6-NEXT: v_mov_b32_e32 v2, 0 1697; GFX6-NEXT: s_setpc_b64 s[30:31] 1698; 1699; GFX8-LABEL: v_lshr_i65_33: 1700; GFX8: ; %bb.0: 1701; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1702; GFX8-NEXT: v_mov_b32_e32 v3, v1 1703; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 1704; GFX8-NEXT: v_mov_b32_e32 v1, 0 1705; GFX8-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] 1706; GFX8-NEXT: v_lshrrev_b32_e32 v2, 1, v3 1707; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 1708; GFX8-NEXT: v_mov_b32_e32 v2, 0 1709; GFX8-NEXT: s_setpc_b64 s[30:31] 1710; 1711; GFX9-LABEL: v_lshr_i65_33: 1712; GFX9: ; %bb.0: 1713; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1714; GFX9-NEXT: v_mov_b32_e32 v3, v1 1715; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 1716; GFX9-NEXT: v_mov_b32_e32 v1, 0 1717; GFX9-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] 1718; GFX9-NEXT: v_lshrrev_b32_e32 v2, 1, v3 1719; GFX9-NEXT: v_or_b32_e32 v0, v2, v0 1720; GFX9-NEXT: v_mov_b32_e32 v2, 0 1721; GFX9-NEXT: s_setpc_b64 s[30:31] 1722; 1723; GFX10-LABEL: v_lshr_i65_33: 1724; GFX10: ; %bb.0: 1725; GFX10-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) 1726; GFX10-NEXT: v_mov_b32_e32 v3, v1 1727; GFX10-NEXT: v_mov_b32_e32 v1, 0 1728; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 1729; GFX10-NEXT: v_lshrrev_b32_e32 v2, 1, v3 1730; GFX10-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] 1731; GFX10-NEXT: v_or_b32_e32 v0, v2, v0 1732; GFX10-NEXT: v_mov_b32_e32 v2, 0 1733; GFX10-NEXT: s_setpc_b64 s[30:31] 1734; 1735; GFX11-LABEL: v_lshr_i65_33: 1736; GFX11: ; %bb.0: 1737; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1738; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 1, v2 1739; GFX11-NEXT: v_mov_b32_e32 v1, 0 1740; GFX11-NEXT: v_lshrrev_b32_e32 v2, 1, v3 1741; GFX11-NEXT: v_lshlrev_b64 v[0:1], 31, v[0:1] 1742; GFX11-NEXT: v_or_b32_e32 v0, v2, v0 1743; GFX11-NEXT: v_mov_b32_e32 v2, 0 1744; GFX11-NEXT: s_setpc_b64 s[30:31] 1745 %result = lshr i65 %value, 33 1746 ret i65 %result 1747} 1748 1749define amdgpu_ps i65 @s_lshr_i65(i65 inreg %value, i65 inreg %amount) { 1750; GCN-LABEL: s_lshr_i65: 1751; GCN: ; %bb.0: 1752; GCN-NEXT: s_and_b64 s[4:5], s[2:3], 1 1753; GCN-NEXT: s_sub_i32 s10, s3, 64 1754; GCN-NEXT: s_sub_i32 s8, 64, s3 1755; GCN-NEXT: s_cmp_lt_u32 s3, 64 1756; GCN-NEXT: s_cselect_b32 s11, 1, 0 1757; GCN-NEXT: s_cmp_eq_u32 s3, 0 1758; GCN-NEXT: s_cselect_b32 s12, 1, 0 1759; GCN-NEXT: s_lshr_b64 s[6:7], s[4:5], s3 1760; GCN-NEXT: s_lshr_b64 s[2:3], s[0:1], s3 1761; GCN-NEXT: s_lshl_b64 s[8:9], s[4:5], s8 1762; GCN-NEXT: s_or_b64 s[2:3], s[2:3], s[8:9] 1763; GCN-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 1764; GCN-NEXT: s_cmp_lg_u32 s11, 0 1765; GCN-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] 1766; GCN-NEXT: s_cmp_lg_u32 s12, 0 1767; GCN-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] 1768; GCN-NEXT: s_cmp_lg_u32 s11, 0 1769; GCN-NEXT: s_cselect_b32 s2, s6, 0 1770; GCN-NEXT: ; return to shader part epilog 1771; 1772; GFX10PLUS-LABEL: s_lshr_i65: 1773; GFX10PLUS: ; %bb.0: 1774; GFX10PLUS-NEXT: s_and_b64 s[4:5], s[2:3], 1 1775; GFX10PLUS-NEXT: s_sub_i32 s10, s3, 64 1776; GFX10PLUS-NEXT: s_sub_i32 s2, 64, s3 
1777; GFX10PLUS-NEXT: s_cmp_lt_u32 s3, 64 1778; GFX10PLUS-NEXT: s_cselect_b32 s11, 1, 0 1779; GFX10PLUS-NEXT: s_cmp_eq_u32 s3, 0 1780; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 1781; GFX10PLUS-NEXT: s_lshr_b64 s[6:7], s[0:1], s3 1782; GFX10PLUS-NEXT: s_lshl_b64 s[8:9], s[4:5], s2 1783; GFX10PLUS-NEXT: s_lshr_b64 s[2:3], s[4:5], s3 1784; GFX10PLUS-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9] 1785; GFX10PLUS-NEXT: s_lshr_b64 s[4:5], s[4:5], s10 1786; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 1787; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], s[6:7], s[4:5] 1788; GFX10PLUS-NEXT: s_cmp_lg_u32 s12, 0 1789; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] 1790; GFX10PLUS-NEXT: s_cmp_lg_u32 s11, 0 1791; GFX10PLUS-NEXT: s_cselect_b32 s2, s2, 0 1792; GFX10PLUS-NEXT: ; return to shader part epilog 1793 %result = lshr i65 %value, %amount 1794 ret i65 %result 1795} 1796 1797define amdgpu_ps i65 @s_lshr_i65_33(i65 inreg %value) { 1798; GCN-LABEL: s_lshr_i65_33: 1799; GCN: ; %bb.0: 1800; GCN-NEXT: s_and_b64 s[2:3], s[2:3], 1 1801; GCN-NEXT: s_lshr_b32 s0, s1, 1 1802; GCN-NEXT: s_mov_b32 s1, 0 1803; GCN-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 1804; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1805; GCN-NEXT: s_mov_b32 s2, 0 1806; GCN-NEXT: ; return to shader part epilog 1807; 1808; GFX10PLUS-LABEL: s_lshr_i65_33: 1809; GFX10PLUS: ; %bb.0: 1810; GFX10PLUS-NEXT: s_and_b64 s[2:3], s[2:3], 1 1811; GFX10PLUS-NEXT: s_lshr_b32 s0, s1, 1 1812; GFX10PLUS-NEXT: s_mov_b32 s1, 0 1813; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 31 1814; GFX10PLUS-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] 1815; GFX10PLUS-NEXT: s_mov_b32 s2, 0 1816; GFX10PLUS-NEXT: ; return to shader part epilog 1817 %result = lshr i65 %value, 33 1818 ret i65 %result 1819} 1820 1821; FIXME: Argument lowering asserts 1822; define <2 x i65> @v_lshr_v2i65(<2 x i65> %value, <2 x i65> %amount) { 1823; %result = lshr <2 x i65> %value, %amount 1824; ret <2 x i65> %result 1825; } 1826 1827; define amdgpu_ps <2 x i65> @s_lshr_v2i65(<2 x i65> inreg %value, <2 x i65> 
inreg %amount) { 1828; %result = lshr <2 x i65> %value, %amount 1829; ret <2 x i65> %result 1830; } 1831