1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s 3; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=fiji -o - %s | FileCheck -check-prefix=GFX8 %s 4; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GFX9 %s 5; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s 6; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s 7 8define i7 @v_usubsat_i7(i7 %lhs, i7 %rhs) { 9; GFX6-LABEL: v_usubsat_i7: 10; GFX6: ; %bb.0: 11; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 12; GFX6-NEXT: v_lshlrev_b32_e32 v0, 25, v0 13; GFX6-NEXT: v_lshlrev_b32_e32 v1, 25, v1 14; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 15; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 16; GFX6-NEXT: v_lshrrev_b32_e32 v0, 25, v0 17; GFX6-NEXT: s_setpc_b64 s[30:31] 18; 19; GFX8-LABEL: v_usubsat_i7: 20; GFX8: ; %bb.0: 21; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 22; GFX8-NEXT: v_lshlrev_b16_e32 v0, 9, v0 23; GFX8-NEXT: v_lshlrev_b16_e32 v1, 9, v1 24; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 25; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 26; GFX8-NEXT: s_setpc_b64 s[30:31] 27; 28; GFX9-LABEL: v_usubsat_i7: 29; GFX9: ; %bb.0: 30; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 31; GFX9-NEXT: v_lshlrev_b16_e32 v0, 9, v0 32; GFX9-NEXT: v_lshlrev_b16_e32 v1, 9, v1 33; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 34; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 35; GFX9-NEXT: s_setpc_b64 s[30:31] 36; 37; GFX10PLUS-LABEL: v_usubsat_i7: 38; GFX10PLUS: ; %bb.0: 39; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 40; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 9, v0 41; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 9, v1 42; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp 43; GFX10PLUS-NEXT: 
v_lshrrev_b16 v0, 9, v0 44; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 45 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) 46 ret i7 %result 47} 48 49define amdgpu_ps i7 @s_usubsat_i7(i7 inreg %lhs, i7 inreg %rhs) { 50; GFX6-LABEL: s_usubsat_i7: 51; GFX6: ; %bb.0: 52; GFX6-NEXT: s_lshl_b32 s0, s0, 25 53; GFX6-NEXT: s_lshl_b32 s1, s1, 25 54; GFX6-NEXT: s_min_u32 s1, s0, s1 55; GFX6-NEXT: s_sub_i32 s0, s0, s1 56; GFX6-NEXT: s_lshr_b32 s0, s0, 25 57; GFX6-NEXT: ; return to shader part epilog 58; 59; GFX8-LABEL: s_usubsat_i7: 60; GFX8: ; %bb.0: 61; GFX8-NEXT: s_lshl_b32 s1, s1, 9 62; GFX8-NEXT: s_lshl_b32 s0, s0, 9 63; GFX8-NEXT: v_mov_b32_e32 v0, s1 64; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 65; GFX8-NEXT: v_lshrrev_b16_e32 v0, 9, v0 66; GFX8-NEXT: v_readfirstlane_b32 s0, v0 67; GFX8-NEXT: ; return to shader part epilog 68; 69; GFX9-LABEL: s_usubsat_i7: 70; GFX9: ; %bb.0: 71; GFX9-NEXT: s_lshl_b32 s1, s1, 9 72; GFX9-NEXT: s_lshl_b32 s0, s0, 9 73; GFX9-NEXT: v_mov_b32_e32 v0, s1 74; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 75; GFX9-NEXT: v_lshrrev_b16_e32 v0, 9, v0 76; GFX9-NEXT: v_readfirstlane_b32 s0, v0 77; GFX9-NEXT: ; return to shader part epilog 78; 79; GFX10PLUS-LABEL: s_usubsat_i7: 80; GFX10PLUS: ; %bb.0: 81; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 9 82; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 9 83; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp 84; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 9, v0 85; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 86; GFX10PLUS-NEXT: ; return to shader part epilog 87 %result = call i7 @llvm.usub.sat.i7(i7 %lhs, i7 %rhs) 88 ret i7 %result 89} 90 91define i8 @v_usubsat_i8(i8 %lhs, i8 %rhs) { 92; GFX6-LABEL: v_usubsat_i8: 93; GFX6: ; %bb.0: 94; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 95; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 96; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 97; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 98; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 99; GFX6-NEXT: v_lshrrev_b32_e32 v0, 24, v0 100; GFX6-NEXT: s_setpc_b64 s[30:31] 101; 
102; GFX8-LABEL: v_usubsat_i8: 103; GFX8: ; %bb.0: 104; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 105; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 106; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 107; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 108; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 109; GFX8-NEXT: s_setpc_b64 s[30:31] 110; 111; GFX9-LABEL: v_usubsat_i8: 112; GFX9: ; %bb.0: 113; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 114; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0 115; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 116; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 117; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 118; GFX9-NEXT: s_setpc_b64 s[30:31] 119; 120; GFX10PLUS-LABEL: v_usubsat_i8: 121; GFX10PLUS: ; %bb.0: 122; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 123; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0 124; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1 125; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp 126; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 127; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 128 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) 129 ret i8 %result 130} 131 132define amdgpu_ps i8 @s_usubsat_i8(i8 inreg %lhs, i8 inreg %rhs) { 133; GFX6-LABEL: s_usubsat_i8: 134; GFX6: ; %bb.0: 135; GFX6-NEXT: s_lshl_b32 s0, s0, 24 136; GFX6-NEXT: s_lshl_b32 s1, s1, 24 137; GFX6-NEXT: s_min_u32 s1, s0, s1 138; GFX6-NEXT: s_sub_i32 s0, s0, s1 139; GFX6-NEXT: s_lshr_b32 s0, s0, 24 140; GFX6-NEXT: ; return to shader part epilog 141; 142; GFX8-LABEL: s_usubsat_i8: 143; GFX8: ; %bb.0: 144; GFX8-NEXT: s_lshl_b32 s1, s1, 8 145; GFX8-NEXT: s_lshl_b32 s0, s0, 8 146; GFX8-NEXT: v_mov_b32_e32 v0, s1 147; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 148; GFX8-NEXT: v_lshrrev_b16_e32 v0, 8, v0 149; GFX8-NEXT: v_readfirstlane_b32 s0, v0 150; GFX8-NEXT: ; return to shader part epilog 151; 152; GFX9-LABEL: s_usubsat_i8: 153; GFX9: ; %bb.0: 154; GFX9-NEXT: s_lshl_b32 s1, s1, 8 155; GFX9-NEXT: s_lshl_b32 s0, s0, 8 156; GFX9-NEXT: v_mov_b32_e32 v0, s1 157; GFX9-NEXT: v_sub_u16_e64 v0, s0, 
v0 clamp 158; GFX9-NEXT: v_lshrrev_b16_e32 v0, 8, v0 159; GFX9-NEXT: v_readfirstlane_b32 s0, v0 160; GFX9-NEXT: ; return to shader part epilog 161; 162; GFX10PLUS-LABEL: s_usubsat_i8: 163; GFX10PLUS: ; %bb.0: 164; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 165; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 166; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp 167; GFX10PLUS-NEXT: v_lshrrev_b16 v0, 8, v0 168; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 169; GFX10PLUS-NEXT: ; return to shader part epilog 170 %result = call i8 @llvm.usub.sat.i8(i8 %lhs, i8 %rhs) 171 ret i8 %result 172} 173 174define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) { 175; GFX6-LABEL: v_usubsat_v2i8: 176; GFX6: ; %bb.0: 177; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 178; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 179; GFX6-NEXT: v_lshrrev_b32_e32 v3, 8, v1 180; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 181; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 182; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 183; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 184; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 185; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 186; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 187; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 188; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 189; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 24 190; GFX6-NEXT: s_setpc_b64 s[30:31] 191; 192; GFX8-LABEL: v_usubsat_v2i8: 193; GFX8: ; %bb.0: 194; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 195; GFX8-NEXT: v_mov_b32_e32 v2, 8 196; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 197; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 198; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0 199; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 200; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 201; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp 202; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 203; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 204; GFX8-NEXT: v_or_b32_sdwa v0, v0, 
v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 205; GFX8-NEXT: s_setpc_b64 s[30:31] 206; 207; GFX9-LABEL: v_usubsat_v2i8: 208; GFX9: ; %bb.0: 209; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 210; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 211; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1 212; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 213; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 214; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0 215; GFX9-NEXT: v_lshl_or_b32 v1, v3, 16, v1 216; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 217; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 218; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 219; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 220; GFX9-NEXT: v_mov_b32_e32 v1, 0xff 221; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 222; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 223; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 224; GFX9-NEXT: s_setpc_b64 s[30:31] 225; 226; GFX10-LABEL: v_usubsat_v2i8: 227; GFX10: ; %bb.0: 228; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 229; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 230; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v1 231; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0 232; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 233; GFX10-NEXT: v_lshl_or_b32 v0, v2, 16, v0 234; GFX10-NEXT: v_lshl_or_b32 v1, v3, 16, v1 235; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 236; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 237; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 238; GFX10-NEXT: v_mov_b32_e32 v1, 0xff 239; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 240; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 241; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 242; GFX10-NEXT: s_setpc_b64 s[30:31] 243; 244; GFX11-LABEL: 
v_usubsat_v2i8: 245; GFX11: ; %bb.0: 246; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 247; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 248; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 249; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 250; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 251; GFX11-NEXT: v_lshl_or_b32 v0, v2, 16, v0 252; GFX11-NEXT: v_lshl_or_b32 v1, v3, 16, v1 253; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 254; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 255; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 256; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 257; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 258; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 259; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 260; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 261; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 262; GFX11-NEXT: s_setpc_b64 s[30:31] 263 %lhs = bitcast i16 %lhs.arg to <2 x i8> 264 %rhs = bitcast i16 %rhs.arg to <2 x i8> 265 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 266 %cast.result = bitcast <2 x i8> %result to i16 267 ret i16 %cast.result 268} 269 270define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) { 271; GFX6-LABEL: s_usubsat_v2i8: 272; GFX6: ; %bb.0: 273; GFX6-NEXT: s_lshr_b32 s2, s0, 8 274; GFX6-NEXT: s_lshr_b32 s3, s1, 8 275; GFX6-NEXT: s_lshl_b32 s0, s0, 24 276; GFX6-NEXT: s_lshl_b32 s1, s1, 24 277; GFX6-NEXT: s_min_u32 s1, s0, s1 278; GFX6-NEXT: s_sub_i32 s0, s0, s1 279; GFX6-NEXT: s_lshl_b32 s1, s2, 24 280; GFX6-NEXT: s_lshl_b32 s2, s3, 24 281; GFX6-NEXT: s_min_u32 s2, s1, s2 282; GFX6-NEXT: s_sub_i32 s1, s1, s2 283; GFX6-NEXT: s_lshr_b32 s1, s1, 24 284; GFX6-NEXT: v_mov_b32_e32 v0, s0 285; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 286; GFX6-NEXT: v_readfirstlane_b32 s0, v0 287; GFX6-NEXT: ; return to shader part epilog 288; 289; GFX8-LABEL: s_usubsat_v2i8: 290; GFX8: ; %bb.0: 291; GFX8-NEXT: s_lshr_b32 s3, s1, 8 292; GFX8-NEXT: s_lshl_b32 s1, s1, 8 293; GFX8-NEXT: s_lshr_b32 s2, s0, 8 294; 
GFX8-NEXT: s_lshl_b32 s0, s0, 8 295; GFX8-NEXT: v_mov_b32_e32 v0, s1 296; GFX8-NEXT: s_lshl_b32 s1, s3, 8 297; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 298; GFX8-NEXT: s_lshl_b32 s0, s2, 8 299; GFX8-NEXT: v_mov_b32_e32 v1, s1 300; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 clamp 301; GFX8-NEXT: v_lshrrev_b16_e32 v1, 8, v1 302; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 303; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 304; GFX8-NEXT: v_readfirstlane_b32 s0, v0 305; GFX8-NEXT: ; return to shader part epilog 306; 307; GFX9-LABEL: s_usubsat_v2i8: 308; GFX9: ; %bb.0: 309; GFX9-NEXT: s_lshr_b32 s2, s0, 8 310; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 311; GFX9-NEXT: s_lshr_b32 s3, s1, 8 312; GFX9-NEXT: s_lshr_b32 s2, s0, 16 313; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s3 314; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 315; GFX9-NEXT: s_lshl_b32 s2, s2, 8 316; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 317; GFX9-NEXT: s_lshr_b32 s2, s1, 16 318; GFX9-NEXT: s_lshl_b32 s1, s1, 0x80008 319; GFX9-NEXT: s_lshl_b32 s2, s2, 8 320; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s2 321; GFX9-NEXT: v_mov_b32_e32 v0, s1 322; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 323; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 324; GFX9-NEXT: v_mov_b32_e32 v1, 0xff 325; GFX9-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 326; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 327; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 328; GFX9-NEXT: v_readfirstlane_b32 s0, v0 329; GFX9-NEXT: ; return to shader part epilog 330; 331; GFX10-LABEL: s_usubsat_v2i8: 332; GFX10: ; %bb.0: 333; GFX10-NEXT: s_lshr_b32 s2, s0, 8 334; GFX10-NEXT: s_lshr_b32 s3, s1, 8 335; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 336; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 337; GFX10-NEXT: s_lshr_b32 s2, s0, 16 338; GFX10-NEXT: s_lshr_b32 s3, s1, 16 339; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 340; 
GFX10-NEXT: s_lshl_b32 s2, s2, 8 341; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 342; GFX10-NEXT: s_lshl_b32 s3, s3, 8 343; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 344; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s3 345; GFX10-NEXT: v_mov_b32_e32 v1, 0xff 346; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp 347; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 348; GFX10-NEXT: v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 349; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD 350; GFX10-NEXT: v_readfirstlane_b32 s0, v0 351; GFX10-NEXT: ; return to shader part epilog 352; 353; GFX11-LABEL: s_usubsat_v2i8: 354; GFX11: ; %bb.0: 355; GFX11-NEXT: s_lshr_b32 s2, s0, 8 356; GFX11-NEXT: s_lshr_b32 s3, s1, 8 357; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 358; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 359; GFX11-NEXT: s_lshr_b32 s2, s0, 16 360; GFX11-NEXT: s_lshr_b32 s3, s1, 16 361; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 362; GFX11-NEXT: s_lshl_b32 s2, s2, 8 363; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 364; GFX11-NEXT: s_lshl_b32 s3, s3, 8 365; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s2 366; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s3 367; GFX11-NEXT: v_pk_sub_u16 v0, s0, s1 clamp 368; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 369; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 370; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 371; GFX11-NEXT: v_and_b32_e32 v1, 0xff, v1 372; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 373; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 374; GFX11-NEXT: v_readfirstlane_b32 s0, v0 375; GFX11-NEXT: ; return to shader part epilog 376 %lhs = bitcast i16 %lhs.arg to <2 x i8> 377 %rhs = bitcast i16 %rhs.arg to <2 x i8> 378 %result = call <2 x i8> @llvm.usub.sat.v2i8(<2 x i8> %lhs, <2 x i8> %rhs) 379 %cast.result = bitcast <2 x i8> %result to i16 380 ret i16 %cast.result 381} 382 383define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) { 384; GFX6-LABEL: v_usubsat_v4i8: 385; GFX6: ; 
%bb.0: 386; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 387; GFX6-NEXT: v_lshrrev_b32_e32 v2, 8, v0 388; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 389; GFX6-NEXT: v_lshrrev_b32_e32 v4, 24, v0 390; GFX6-NEXT: v_lshrrev_b32_e32 v5, 8, v1 391; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v1 392; GFX6-NEXT: v_lshrrev_b32_e32 v7, 24, v1 393; GFX6-NEXT: v_lshlrev_b32_e32 v0, 24, v0 394; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v1 395; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 396; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 397; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v2 398; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v5 399; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 400; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 401; GFX6-NEXT: v_lshlrev_b32_e32 v2, 24, v3 402; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v6 403; GFX6-NEXT: v_min_u32_e32 v3, v2, v3 404; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 405; GFX6-NEXT: v_lshlrev_b32_e32 v3, 24, v4 406; GFX6-NEXT: v_lshlrev_b32_e32 v4, 24, v7 407; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 408; GFX6-NEXT: v_lshrrev_b32_e32 v1, 24, v1 409; GFX6-NEXT: v_lshrrev_b32_e32 v2, 24, v2 410; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 411; GFX6-NEXT: v_lshrrev_b32_e32 v3, 24, v3 412; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 24 413; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2 414; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 415; GFX6-NEXT: v_lshlrev_b32_e32 v1, 24, v3 416; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 417; GFX6-NEXT: s_setpc_b64 s[30:31] 418; 419; GFX8-LABEL: v_usubsat_v4i8: 420; GFX8: ; %bb.0: 421; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 422; GFX8-NEXT: v_mov_b32_e32 v2, 8 423; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v2, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 424; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v0 425; GFX8-NEXT: v_lshrrev_b32_e32 v5, 24, v0 426; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 427; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v1 428; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v1 429; GFX8-NEXT: 
v_lshlrev_b16_e32 v0, 8, v0 430; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 431; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 432; GFX8-NEXT: v_sub_u16_e64 v1, v3, v2 clamp 433; GFX8-NEXT: v_lshlrev_b16_e32 v2, 8, v4 434; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v6 435; GFX8-NEXT: v_sub_u16_e64 v2, v2, v3 clamp 436; GFX8-NEXT: v_lshlrev_b16_e32 v3, 8, v5 437; GFX8-NEXT: v_lshlrev_b16_e32 v4, 8, v7 438; GFX8-NEXT: v_sub_u16_e64 v3, v3, v4 clamp 439; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 440; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 441; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 442; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 443; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 444; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 445; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 446; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 447; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 448; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 449; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 450; GFX8-NEXT: s_setpc_b64 s[30:31] 451; 452; GFX9-LABEL: v_usubsat_v4i8: 453; GFX9: ; %bb.0: 454; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 455; GFX9-NEXT: v_lshrrev_b32_e32 v3, 24, v0 456; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0 457; GFX9-NEXT: v_lshrrev_b32_e32 v4, 8, v1 458; GFX9-NEXT: v_and_b32_e32 v6, 0xffff, v0 459; GFX9-NEXT: v_alignbit_b32 v0, v3, v0, 16 460; GFX9-NEXT: v_and_b32_e32 v3, 0xffff, v1 461; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 462; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v6 463; GFX9-NEXT: v_lshl_or_b32 v3, v4, 16, v3 464; GFX9-NEXT: v_alignbit_b32 v1, v5, v1, 16 465; GFX9-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 466; GFX9-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 467; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 468; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 
op_sel_hi:[0,1] 469; GFX9-NEXT: v_pk_sub_u16 v2, v2, v3 clamp 470; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 471; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] 472; GFX9-NEXT: v_mov_b32_e32 v3, 8 473; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 474; GFX9-NEXT: v_mov_b32_e32 v2, 0xff 475; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 476; GFX9-NEXT: v_and_or_b32 v1, v1, v2, v3 477; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v0 478; GFX9-NEXT: v_mov_b32_e32 v3, 24 479; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 480; GFX9-NEXT: v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 481; GFX9-NEXT: v_or3_b32 v0, v1, v2, v0 482; GFX9-NEXT: s_setpc_b64 s[30:31] 483; 484; GFX10-LABEL: v_usubsat_v4i8: 485; GFX10: ; %bb.0: 486; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 487; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v0 488; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 489; GFX10-NEXT: v_and_b32_e32 v4, 0xffff, v0 490; GFX10-NEXT: v_lshrrev_b32_e32 v5, 8, v1 491; GFX10-NEXT: v_and_b32_e32 v6, 0xffff, v1 492; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v1 493; GFX10-NEXT: v_alignbit_b32 v0, v3, v0, 16 494; GFX10-NEXT: v_lshl_or_b32 v2, v2, 16, v4 495; GFX10-NEXT: v_mov_b32_e32 v4, 24 496; GFX10-NEXT: v_lshl_or_b32 v3, v5, 16, v6 497; GFX10-NEXT: v_alignbit_b32 v1, v7, v1, 16 498; GFX10-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 499; GFX10-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 500; GFX10-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 501; GFX10-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 502; GFX10-NEXT: v_pk_sub_u16 v2, v2, v3 clamp 503; GFX10-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 504; GFX10-NEXT: v_mov_b32_e32 v1, 8 505; GFX10-NEXT: v_pk_lshrrev_b16 v2, 8, v2 op_sel_hi:[0,1] 506; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 507; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 508; 
GFX10-NEXT: v_and_b32_e32 v3, 0xff, v0 509; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 510; GFX10-NEXT: v_and_or_b32 v1, 0xff, v2, v1 511; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 512; GFX10-NEXT: v_or3_b32 v0, v1, v2, v0 513; GFX10-NEXT: s_setpc_b64 s[30:31] 514; 515; GFX11-LABEL: v_usubsat_v4i8: 516; GFX11: ; %bb.0: 517; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 518; GFX11-NEXT: v_lshrrev_b32_e32 v2, 8, v0 519; GFX11-NEXT: v_lshrrev_b32_e32 v3, 8, v1 520; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v0 521; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v1 522; GFX11-NEXT: v_lshrrev_b32_e32 v6, 24, v0 523; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 524; GFX11-NEXT: v_lshl_or_b32 v2, v2, 16, v4 525; GFX11-NEXT: v_lshl_or_b32 v3, v3, 16, v5 526; GFX11-NEXT: v_alignbit_b32 v0, v6, v0, 16 527; GFX11-NEXT: v_alignbit_b32 v1, v7, v1, 16 528; GFX11-NEXT: v_pk_lshlrev_b16 v2, 8, v2 op_sel_hi:[0,1] 529; GFX11-NEXT: v_pk_lshlrev_b16 v3, 8, v3 op_sel_hi:[0,1] 530; GFX11-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] 531; GFX11-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] 532; GFX11-NEXT: v_pk_sub_u16 v2, v2, v3 clamp 533; GFX11-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 534; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1] 535; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 536; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 8 537; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v0 538; GFX11-NEXT: v_bfe_u32 v0, v0, 16, 8 539; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 540; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3 541; GFX11-NEXT: v_lshlrev_b32_e32 v0, 24, v0 542; GFX11-NEXT: v_and_or_b32 v1, 0xff, v1, v2 543; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 544; GFX11-NEXT: s_setpc_b64 s[30:31] 545 %lhs = bitcast i32 %lhs.arg to <4 x i8> 546 %rhs = bitcast i32 %rhs.arg to <4 x i8> 547 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 548 %cast.result = bitcast <4 x i8> %result to i32 549 ret i32 %cast.result 550} 551 
552define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) { 553; GFX6-LABEL: s_usubsat_v4i8: 554; GFX6: ; %bb.0: 555; GFX6-NEXT: s_lshr_b32 s2, s0, 8 556; GFX6-NEXT: s_lshr_b32 s3, s0, 16 557; GFX6-NEXT: s_lshr_b32 s4, s0, 24 558; GFX6-NEXT: s_lshr_b32 s5, s1, 8 559; GFX6-NEXT: s_lshr_b32 s6, s1, 16 560; GFX6-NEXT: s_lshr_b32 s7, s1, 24 561; GFX6-NEXT: s_lshl_b32 s0, s0, 24 562; GFX6-NEXT: s_lshl_b32 s1, s1, 24 563; GFX6-NEXT: s_min_u32 s1, s0, s1 564; GFX6-NEXT: s_sub_i32 s0, s0, s1 565; GFX6-NEXT: s_lshl_b32 s1, s2, 24 566; GFX6-NEXT: s_lshl_b32 s2, s5, 24 567; GFX6-NEXT: s_min_u32 s2, s1, s2 568; GFX6-NEXT: s_sub_i32 s1, s1, s2 569; GFX6-NEXT: s_lshl_b32 s2, s3, 24 570; GFX6-NEXT: s_lshl_b32 s3, s6, 24 571; GFX6-NEXT: s_min_u32 s3, s2, s3 572; GFX6-NEXT: s_sub_i32 s2, s2, s3 573; GFX6-NEXT: s_lshl_b32 s3, s4, 24 574; GFX6-NEXT: s_lshl_b32 s4, s7, 24 575; GFX6-NEXT: s_min_u32 s4, s3, s4 576; GFX6-NEXT: s_lshr_b32 s1, s1, 24 577; GFX6-NEXT: s_lshr_b32 s2, s2, 24 578; GFX6-NEXT: s_sub_i32 s3, s3, s4 579; GFX6-NEXT: v_mov_b32_e32 v0, s0 580; GFX6-NEXT: s_lshr_b32 s3, s3, 24 581; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 24 582; GFX6-NEXT: s_lshl_b32 s0, s2, 16 583; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 584; GFX6-NEXT: s_lshl_b32 s0, s3, 24 585; GFX6-NEXT: v_or_b32_e32 v0, s0, v0 586; GFX6-NEXT: v_readfirstlane_b32 s0, v0 587; GFX6-NEXT: ; return to shader part epilog 588; 589; GFX8-LABEL: s_usubsat_v4i8: 590; GFX8: ; %bb.0: 591; GFX8-NEXT: s_lshr_b32 s5, s1, 8 592; GFX8-NEXT: s_lshr_b32 s6, s1, 16 593; GFX8-NEXT: s_lshr_b32 s7, s1, 24 594; GFX8-NEXT: s_lshl_b32 s1, s1, 8 595; GFX8-NEXT: s_lshr_b32 s2, s0, 8 596; GFX8-NEXT: s_lshr_b32 s3, s0, 16 597; GFX8-NEXT: s_lshr_b32 s4, s0, 24 598; GFX8-NEXT: s_lshl_b32 s0, s0, 8 599; GFX8-NEXT: v_mov_b32_e32 v0, s1 600; GFX8-NEXT: s_lshl_b32 s1, s5, 8 601; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 602; GFX8-NEXT: s_lshl_b32 s0, s2, 8 603; GFX8-NEXT: v_mov_b32_e32 v1, s1 604; GFX8-NEXT: v_sub_u16_e64 v1, s0, v1 
clamp 605; GFX8-NEXT: s_lshl_b32 s1, s6, 8 606; GFX8-NEXT: v_mov_b32_e32 v4, 0xff 607; GFX8-NEXT: s_lshl_b32 s0, s3, 8 608; GFX8-NEXT: v_mov_b32_e32 v2, s1 609; GFX8-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 610; GFX8-NEXT: v_sub_u16_e64 v2, s0, v2 clamp 611; GFX8-NEXT: s_lshl_b32 s1, s7, 8 612; GFX8-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 613; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 614; GFX8-NEXT: s_lshl_b32 s0, s4, 8 615; GFX8-NEXT: v_mov_b32_e32 v3, s1 616; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 617; GFX8-NEXT: v_and_b32_sdwa v1, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 618; GFX8-NEXT: v_sub_u16_e64 v3, s0, v3 clamp 619; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1 620; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 621; GFX8-NEXT: v_and_b32_sdwa v1, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD 622; GFX8-NEXT: v_lshlrev_b32_e32 v1, 24, v1 623; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 624; GFX8-NEXT: v_readfirstlane_b32 s0, v0 625; GFX8-NEXT: ; return to shader part epilog 626; 627; GFX9-LABEL: s_usubsat_v4i8: 628; GFX9: ; %bb.0: 629; GFX9-NEXT: s_lshr_b32 s2, s0, 8 630; GFX9-NEXT: s_lshr_b32 s3, s0, 16 631; GFX9-NEXT: s_lshr_b32 s4, s0, 24 632; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s2 633; GFX9-NEXT: s_pack_ll_b32_b16 s2, s3, s4 634; GFX9-NEXT: s_lshr_b32 s4, s0, 16 635; GFX9-NEXT: s_lshl_b32 s0, s0, 0x80008 636; GFX9-NEXT: s_lshl_b32 s4, s4, 8 637; GFX9-NEXT: s_lshr_b32 s5, s1, 8 638; GFX9-NEXT: s_pack_ll_b32_b16 s0, s0, s4 639; GFX9-NEXT: s_lshr_b32 s4, s2, 16 640; GFX9-NEXT: s_lshr_b32 s6, s1, 16 641; GFX9-NEXT: s_lshr_b32 s7, s1, 24 642; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s5 643; GFX9-NEXT: s_lshl_b32 s2, s2, 0x80008 644; GFX9-NEXT: s_lshl_b32 s4, s4, 8 645; GFX9-NEXT: s_pack_ll_b32_b16 s2, s2, s4 646; GFX9-NEXT: s_lshr_b32 s4, s1, 16 647; GFX9-NEXT: s_pack_ll_b32_b16 s3, s6, s7 648; GFX9-NEXT: s_lshl_b32 s1, 
s1, 0x80008 649; GFX9-NEXT: s_lshl_b32 s4, s4, 8 650; GFX9-NEXT: s_pack_ll_b32_b16 s1, s1, s4 651; GFX9-NEXT: s_lshr_b32 s4, s3, 16 652; GFX9-NEXT: s_lshl_b32 s3, s3, 0x80008 653; GFX9-NEXT: s_lshl_b32 s4, s4, 8 654; GFX9-NEXT: s_pack_ll_b32_b16 s3, s3, s4 655; GFX9-NEXT: v_mov_b32_e32 v0, s1 656; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 657; GFX9-NEXT: v_mov_b32_e32 v1, s3 658; GFX9-NEXT: v_pk_sub_u16 v1, s2, v1 clamp 659; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 660; GFX9-NEXT: v_mov_b32_e32 v3, 8 661; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 662; GFX9-NEXT: v_mov_b32_e32 v2, 0xff 663; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 664; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v3 665; GFX9-NEXT: v_and_b32_e32 v2, 0xff, v1 666; GFX9-NEXT: v_mov_b32_e32 v3, 24 667; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 668; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 669; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 670; GFX9-NEXT: v_readfirstlane_b32 s0, v0 671; GFX9-NEXT: ; return to shader part epilog 672; 673; GFX10-LABEL: s_usubsat_v4i8: 674; GFX10: ; %bb.0: 675; GFX10-NEXT: s_lshr_b32 s2, s0, 8 676; GFX10-NEXT: s_lshr_b32 s3, s0, 16 677; GFX10-NEXT: s_lshr_b32 s4, s0, 24 678; GFX10-NEXT: s_lshr_b32 s5, s1, 8 679; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s2 680; GFX10-NEXT: s_pack_ll_b32_b16 s2, s3, s4 681; GFX10-NEXT: s_lshr_b32 s6, s1, 16 682; GFX10-NEXT: s_lshr_b32 s7, s1, 24 683; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s5 684; GFX10-NEXT: s_lshr_b32 s4, s0, 16 685; GFX10-NEXT: s_lshr_b32 s5, s2, 16 686; GFX10-NEXT: s_pack_ll_b32_b16 s3, s6, s7 687; GFX10-NEXT: s_lshl_b32 s0, s0, 0x80008 688; GFX10-NEXT: s_lshl_b32 s4, s4, 8 689; GFX10-NEXT: s_lshl_b32 s2, s2, 0x80008 690; GFX10-NEXT: s_lshl_b32 s5, s5, 8 691; GFX10-NEXT: s_pack_ll_b32_b16 s0, s0, s4 692; GFX10-NEXT: s_pack_ll_b32_b16 s2, s2, s5 693; GFX10-NEXT: s_lshr_b32 s4, s1, 16 694; 
GFX10-NEXT: s_lshr_b32 s5, s3, 16 695; GFX10-NEXT: s_lshl_b32 s1, s1, 0x80008 696; GFX10-NEXT: s_lshl_b32 s4, s4, 8 697; GFX10-NEXT: s_lshl_b32 s3, s3, 0x80008 698; GFX10-NEXT: s_lshl_b32 s5, s5, 8 699; GFX10-NEXT: s_pack_ll_b32_b16 s1, s1, s4 700; GFX10-NEXT: s_pack_ll_b32_b16 s3, s3, s5 701; GFX10-NEXT: v_pk_sub_u16 v0, s0, s1 clamp 702; GFX10-NEXT: v_pk_sub_u16 v1, s2, s3 clamp 703; GFX10-NEXT: v_mov_b32_e32 v2, 8 704; GFX10-NEXT: v_mov_b32_e32 v4, 24 705; GFX10-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 706; GFX10-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 707; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 708; GFX10-NEXT: v_and_b32_e32 v3, 0xff, v1 709; GFX10-NEXT: v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 710; GFX10-NEXT: v_and_or_b32 v0, 0xff, v0, v2 711; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3 712; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 713; GFX10-NEXT: v_readfirstlane_b32 s0, v0 714; GFX10-NEXT: ; return to shader part epilog 715; 716; GFX11-LABEL: s_usubsat_v4i8: 717; GFX11: ; %bb.0: 718; GFX11-NEXT: s_lshr_b32 s2, s0, 8 719; GFX11-NEXT: s_lshr_b32 s3, s0, 24 720; GFX11-NEXT: s_lshr_b32 s4, s1, 8 721; GFX11-NEXT: s_lshr_b32 s5, s1, 24 722; GFX11-NEXT: s_pack_ll_b32_b16 s2, s0, s2 723; GFX11-NEXT: s_pack_hl_b32_b16 s0, s0, s3 724; GFX11-NEXT: s_pack_ll_b32_b16 s3, s1, s4 725; GFX11-NEXT: s_lshr_b32 s4, s2, 16 726; GFX11-NEXT: s_pack_hl_b32_b16 s1, s1, s5 727; GFX11-NEXT: s_lshr_b32 s5, s3, 16 728; GFX11-NEXT: s_lshl_b32 s2, s2, 0x80008 729; GFX11-NEXT: s_lshl_b32 s4, s4, 8 730; GFX11-NEXT: s_lshl_b32 s3, s3, 0x80008 731; GFX11-NEXT: s_lshl_b32 s5, s5, 8 732; GFX11-NEXT: s_pack_ll_b32_b16 s2, s2, s4 733; GFX11-NEXT: s_pack_ll_b32_b16 s3, s3, s5 734; GFX11-NEXT: s_lshr_b32 s4, s0, 16 735; GFX11-NEXT: s_lshr_b32 s5, s1, 16 736; GFX11-NEXT: v_pk_sub_u16 v0, s2, s3 clamp 737; GFX11-NEXT: s_lshl_b32 s0, s0, 0x80008 738; GFX11-NEXT: 
s_lshl_b32 s4, s4, 8 739; GFX11-NEXT: s_lshl_b32 s1, s1, 0x80008 740; GFX11-NEXT: s_lshl_b32 s2, s5, 8 741; GFX11-NEXT: s_pack_ll_b32_b16 s0, s0, s4 742; GFX11-NEXT: s_pack_ll_b32_b16 s1, s1, s2 743; GFX11-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] 744; GFX11-NEXT: v_pk_sub_u16 v1, s0, s1 clamp 745; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 8 746; GFX11-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] 747; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 748; GFX11-NEXT: v_and_b32_e32 v3, 0xff, v1 749; GFX11-NEXT: v_bfe_u32 v1, v1, 16, 8 750; GFX11-NEXT: v_and_or_b32 v0, 0xff, v0, v2 751; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3 752; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 753; GFX11-NEXT: v_or3_b32 v0, v0, v2, v1 754; GFX11-NEXT: v_readfirstlane_b32 s0, v0 755; GFX11-NEXT: ; return to shader part epilog 756 %lhs = bitcast i32 %lhs.arg to <4 x i8> 757 %rhs = bitcast i32 %rhs.arg to <4 x i8> 758 %result = call <4 x i8> @llvm.usub.sat.v4i8(<4 x i8> %lhs, <4 x i8> %rhs) 759 %cast.result = bitcast <4 x i8> %result to i32 760 ret i32 %cast.result 761} 762 763define i24 @v_usubsat_i24(i24 %lhs, i24 %rhs) { 764; GFX6-LABEL: v_usubsat_i24: 765; GFX6: ; %bb.0: 766; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 767; GFX6-NEXT: v_lshlrev_b32_e32 v0, 8, v0 768; GFX6-NEXT: v_lshlrev_b32_e32 v1, 8, v1 769; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 770; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 771; GFX6-NEXT: v_lshrrev_b32_e32 v0, 8, v0 772; GFX6-NEXT: s_setpc_b64 s[30:31] 773; 774; GFX8-LABEL: v_usubsat_i24: 775; GFX8: ; %bb.0: 776; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 777; GFX8-NEXT: v_lshlrev_b32_e32 v0, 8, v0 778; GFX8-NEXT: v_lshlrev_b32_e32 v1, 8, v1 779; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp 780; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 781; GFX8-NEXT: s_setpc_b64 s[30:31] 782; 783; GFX9-LABEL: v_usubsat_i24: 784; GFX9: ; %bb.0: 785; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 786; GFX9-NEXT: v_lshlrev_b32_e32 v0, 8, v0 787; GFX9-NEXT: v_lshlrev_b32_e32 
v1, 8, v1 788; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp 789; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 790; GFX9-NEXT: s_setpc_b64 s[30:31] 791; 792; GFX10PLUS-LABEL: v_usubsat_i24: 793; GFX10PLUS: ; %bb.0: 794; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 795; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v0, 8, v0 796; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 8, v1 797; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp 798; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0 799; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 800 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) 801 ret i24 %result 802} 803 804define amdgpu_ps i24 @s_usubsat_i24(i24 inreg %lhs, i24 inreg %rhs) { 805; GFX6-LABEL: s_usubsat_i24: 806; GFX6: ; %bb.0: 807; GFX6-NEXT: s_lshl_b32 s0, s0, 8 808; GFX6-NEXT: s_lshl_b32 s1, s1, 8 809; GFX6-NEXT: s_min_u32 s1, s0, s1 810; GFX6-NEXT: s_sub_i32 s0, s0, s1 811; GFX6-NEXT: s_lshr_b32 s0, s0, 8 812; GFX6-NEXT: ; return to shader part epilog 813; 814; GFX8-LABEL: s_usubsat_i24: 815; GFX8: ; %bb.0: 816; GFX8-NEXT: s_lshl_b32 s1, s1, 8 817; GFX8-NEXT: s_lshl_b32 s0, s0, 8 818; GFX8-NEXT: v_mov_b32_e32 v0, s1 819; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp 820; GFX8-NEXT: v_lshrrev_b32_e32 v0, 8, v0 821; GFX8-NEXT: v_readfirstlane_b32 s0, v0 822; GFX8-NEXT: ; return to shader part epilog 823; 824; GFX9-LABEL: s_usubsat_i24: 825; GFX9: ; %bb.0: 826; GFX9-NEXT: s_lshl_b32 s1, s1, 8 827; GFX9-NEXT: s_lshl_b32 s0, s0, 8 828; GFX9-NEXT: v_mov_b32_e32 v0, s1 829; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 830; GFX9-NEXT: v_lshrrev_b32_e32 v0, 8, v0 831; GFX9-NEXT: v_readfirstlane_b32 s0, v0 832; GFX9-NEXT: ; return to shader part epilog 833; 834; GFX10PLUS-LABEL: s_usubsat_i24: 835; GFX10PLUS: ; %bb.0: 836; GFX10PLUS-NEXT: s_lshl_b32 s0, s0, 8 837; GFX10PLUS-NEXT: s_lshl_b32 s1, s1, 8 838; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp 839; GFX10PLUS-NEXT: v_lshrrev_b32_e32 v0, 8, v0 840; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 841; GFX10PLUS-NEXT: ; return to 
shader part epilog 842 %result = call i24 @llvm.usub.sat.i24(i24 %lhs, i24 %rhs) 843 ret i24 %result 844} 845 846define i32 @v_usubsat_i32(i32 %lhs, i32 %rhs) { 847; GFX6-LABEL: v_usubsat_i32: 848; GFX6: ; %bb.0: 849; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 850; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 851; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 852; GFX6-NEXT: s_setpc_b64 s[30:31] 853; 854; GFX8-LABEL: v_usubsat_i32: 855; GFX8: ; %bb.0: 856; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 857; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v1 clamp 858; GFX8-NEXT: s_setpc_b64 s[30:31] 859; 860; GFX9-LABEL: v_usubsat_i32: 861; GFX9: ; %bb.0: 862; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 863; GFX9-NEXT: v_sub_u32_e64 v0, v0, v1 clamp 864; GFX9-NEXT: s_setpc_b64 s[30:31] 865; 866; GFX10PLUS-LABEL: v_usubsat_i32: 867; GFX10PLUS: ; %bb.0: 868; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 869; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v1 clamp 870; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 871 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) 872 ret i32 %result 873} 874 875define amdgpu_ps i32 @s_usubsat_i32(i32 inreg %lhs, i32 inreg %rhs) { 876; GFX6-LABEL: s_usubsat_i32: 877; GFX6: ; %bb.0: 878; GFX6-NEXT: s_min_u32 s1, s0, s1 879; GFX6-NEXT: s_sub_i32 s0, s0, s1 880; GFX6-NEXT: ; return to shader part epilog 881; 882; GFX8-LABEL: s_usubsat_i32: 883; GFX8: ; %bb.0: 884; GFX8-NEXT: v_mov_b32_e32 v0, s1 885; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp 886; GFX8-NEXT: v_readfirstlane_b32 s0, v0 887; GFX8-NEXT: ; return to shader part epilog 888; 889; GFX9-LABEL: s_usubsat_i32: 890; GFX9: ; %bb.0: 891; GFX9-NEXT: v_mov_b32_e32 v0, s1 892; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 893; GFX9-NEXT: v_readfirstlane_b32 s0, v0 894; GFX9-NEXT: ; return to shader part epilog 895; 896; GFX10PLUS-LABEL: s_usubsat_i32: 897; GFX10PLUS: ; %bb.0: 898; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s1 clamp 899; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 
900; GFX10PLUS-NEXT: ; return to shader part epilog 901 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) 902 ret i32 %result 903} 904 905define amdgpu_ps float @usubsat_i32_sv(i32 inreg %lhs, i32 %rhs) { 906; GFX6-LABEL: usubsat_i32_sv: 907; GFX6: ; %bb.0: 908; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 909; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 910; GFX6-NEXT: ; return to shader part epilog 911; 912; GFX8-LABEL: usubsat_i32_sv: 913; GFX8: ; %bb.0: 914; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], s0, v0 clamp 915; GFX8-NEXT: ; return to shader part epilog 916; 917; GFX9-LABEL: usubsat_i32_sv: 918; GFX9: ; %bb.0: 919; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 920; GFX9-NEXT: ; return to shader part epilog 921; 922; GFX10PLUS-LABEL: usubsat_i32_sv: 923; GFX10PLUS: ; %bb.0: 924; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, v0 clamp 925; GFX10PLUS-NEXT: ; return to shader part epilog 926 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) 927 %cast = bitcast i32 %result to float 928 ret float %cast 929} 930 931define amdgpu_ps float @usubsat_i32_vs(i32 %lhs, i32 inreg %rhs) { 932; GFX6-LABEL: usubsat_i32_vs: 933; GFX6: ; %bb.0: 934; GFX6-NEXT: v_min_u32_e32 v1, s0, v0 935; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 936; GFX6-NEXT: ; return to shader part epilog 937; 938; GFX8-LABEL: usubsat_i32_vs: 939; GFX8: ; %bb.0: 940; GFX8-NEXT: v_sub_u32_e64 v0, s[0:1], v0, s0 clamp 941; GFX8-NEXT: ; return to shader part epilog 942; 943; GFX9-LABEL: usubsat_i32_vs: 944; GFX9: ; %bb.0: 945; GFX9-NEXT: v_sub_u32_e64 v0, v0, s0 clamp 946; GFX9-NEXT: ; return to shader part epilog 947; 948; GFX10PLUS-LABEL: usubsat_i32_vs: 949; GFX10PLUS: ; %bb.0: 950; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, s0 clamp 951; GFX10PLUS-NEXT: ; return to shader part epilog 952 %result = call i32 @llvm.usub.sat.i32(i32 %lhs, i32 %rhs) 953 %cast = bitcast i32 %result to float 954 ret float %cast 955} 956 957define <2 x i32> @v_usubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { 958; GFX6-LABEL: v_usubsat_v2i32: 
959; GFX6: ; %bb.0: 960; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 961; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 962; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 963; GFX6-NEXT: v_min_u32_e32 v2, v1, v3 964; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 965; GFX6-NEXT: s_setpc_b64 s[30:31] 966; 967; GFX8-LABEL: v_usubsat_v2i32: 968; GFX8: ; %bb.0: 969; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 970; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v2 clamp 971; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v3 clamp 972; GFX8-NEXT: s_setpc_b64 s[30:31] 973; 974; GFX9-LABEL: v_usubsat_v2i32: 975; GFX9: ; %bb.0: 976; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 977; GFX9-NEXT: v_sub_u32_e64 v0, v0, v2 clamp 978; GFX9-NEXT: v_sub_u32_e64 v1, v1, v3 clamp 979; GFX9-NEXT: s_setpc_b64 s[30:31] 980; 981; GFX10PLUS-LABEL: v_usubsat_v2i32: 982; GFX10PLUS: ; %bb.0: 983; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 984; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v2 clamp 985; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v3 clamp 986; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 987 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 988 ret <2 x i32> %result 989} 990 991define amdgpu_ps <2 x i32> @s_usubsat_v2i32(<2 x i32> inreg %lhs, <2 x i32> inreg %rhs) { 992; GFX6-LABEL: s_usubsat_v2i32: 993; GFX6: ; %bb.0: 994; GFX6-NEXT: s_min_u32 s2, s0, s2 995; GFX6-NEXT: s_sub_i32 s0, s0, s2 996; GFX6-NEXT: s_min_u32 s2, s1, s3 997; GFX6-NEXT: s_sub_i32 s1, s1, s2 998; GFX6-NEXT: ; return to shader part epilog 999; 1000; GFX8-LABEL: s_usubsat_v2i32: 1001; GFX8: ; %bb.0: 1002; GFX8-NEXT: v_mov_b32_e32 v0, s2 1003; GFX8-NEXT: v_mov_b32_e32 v1, s3 1004; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], s0, v0 clamp 1005; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp 1006; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1007; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1008; GFX8-NEXT: ; return to shader part epilog 1009; 1010; GFX9-LABEL: s_usubsat_v2i32: 1011; GFX9: ; %bb.0: 1012; GFX9-NEXT: 
v_mov_b32_e32 v0, s2 1013; GFX9-NEXT: v_mov_b32_e32 v1, s3 1014; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 1015; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp 1016; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1017; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1018; GFX9-NEXT: ; return to shader part epilog 1019; 1020; GFX10PLUS-LABEL: s_usubsat_v2i32: 1021; GFX10PLUS: ; %bb.0: 1022; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s2 clamp 1023; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s3 clamp 1024; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1025; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1026; GFX10PLUS-NEXT: ; return to shader part epilog 1027 %result = call <2 x i32> @llvm.usub.sat.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) 1028 ret <2 x i32> %result 1029} 1030 1031define <3 x i32> @v_usubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { 1032; GFX6-LABEL: v_usubsat_v3i32: 1033; GFX6: ; %bb.0: 1034; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1035; GFX6-NEXT: v_min_u32_e32 v3, v0, v3 1036; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v3 1037; GFX6-NEXT: v_min_u32_e32 v3, v1, v4 1038; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v3 1039; GFX6-NEXT: v_min_u32_e32 v3, v2, v5 1040; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 1041; GFX6-NEXT: s_setpc_b64 s[30:31] 1042; 1043; GFX8-LABEL: v_usubsat_v3i32: 1044; GFX8: ; %bb.0: 1045; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1046; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v3 clamp 1047; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v4 clamp 1048; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v5 clamp 1049; GFX8-NEXT: s_setpc_b64 s[30:31] 1050; 1051; GFX9-LABEL: v_usubsat_v3i32: 1052; GFX9: ; %bb.0: 1053; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1054; GFX9-NEXT: v_sub_u32_e64 v0, v0, v3 clamp 1055; GFX9-NEXT: v_sub_u32_e64 v1, v1, v4 clamp 1056; GFX9-NEXT: v_sub_u32_e64 v2, v2, v5 clamp 1057; GFX9-NEXT: s_setpc_b64 s[30:31] 1058; 1059; GFX10PLUS-LABEL: v_usubsat_v3i32: 1060; GFX10PLUS: ; %bb.0: 1061; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
1062; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v3 clamp 1063; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v4 clamp 1064; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v5 clamp 1065; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1066 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1067 ret <3 x i32> %result 1068} 1069 1070define amdgpu_ps <3 x i32> @s_usubsat_v3i32(<3 x i32> inreg %lhs, <3 x i32> inreg %rhs) { 1071; GFX6-LABEL: s_usubsat_v3i32: 1072; GFX6: ; %bb.0: 1073; GFX6-NEXT: s_min_u32 s3, s0, s3 1074; GFX6-NEXT: s_sub_i32 s0, s0, s3 1075; GFX6-NEXT: s_min_u32 s3, s1, s4 1076; GFX6-NEXT: s_sub_i32 s1, s1, s3 1077; GFX6-NEXT: s_min_u32 s3, s2, s5 1078; GFX6-NEXT: s_sub_i32 s2, s2, s3 1079; GFX6-NEXT: ; return to shader part epilog 1080; 1081; GFX8-LABEL: s_usubsat_v3i32: 1082; GFX8: ; %bb.0: 1083; GFX8-NEXT: v_mov_b32_e32 v0, s3 1084; GFX8-NEXT: v_mov_b32_e32 v1, s4 1085; GFX8-NEXT: v_mov_b32_e32 v2, s5 1086; GFX8-NEXT: v_sub_u32_e64 v0, s[6:7], s0, v0 clamp 1087; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp 1088; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp 1089; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1090; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1091; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1092; GFX8-NEXT: ; return to shader part epilog 1093; 1094; GFX9-LABEL: s_usubsat_v3i32: 1095; GFX9: ; %bb.0: 1096; GFX9-NEXT: v_mov_b32_e32 v0, s3 1097; GFX9-NEXT: v_mov_b32_e32 v1, s4 1098; GFX9-NEXT: v_mov_b32_e32 v2, s5 1099; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 1100; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp 1101; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp 1102; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1103; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1104; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1105; GFX9-NEXT: ; return to shader part epilog 1106; 1107; GFX10PLUS-LABEL: s_usubsat_v3i32: 1108; GFX10PLUS: ; %bb.0: 1109; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s3 clamp 1110; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s4 clamp 1111; GFX10PLUS-NEXT: 
v_sub_nc_u32_e64 v2, s2, s5 clamp 1112; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1113; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1114; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1115; GFX10PLUS-NEXT: ; return to shader part epilog 1116 %result = call <3 x i32> @llvm.usub.sat.v3i32(<3 x i32> %lhs, <3 x i32> %rhs) 1117 ret <3 x i32> %result 1118} 1119 1120define <4 x i32> @v_usubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { 1121; GFX6-LABEL: v_usubsat_v4i32: 1122; GFX6: ; %bb.0: 1123; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1124; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 1125; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 1126; GFX6-NEXT: v_min_u32_e32 v4, v1, v5 1127; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 1128; GFX6-NEXT: v_min_u32_e32 v4, v2, v6 1129; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 1130; GFX6-NEXT: v_min_u32_e32 v4, v3, v7 1131; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 1132; GFX6-NEXT: s_setpc_b64 s[30:31] 1133; 1134; GFX8-LABEL: v_usubsat_v4i32: 1135; GFX8: ; %bb.0: 1136; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1137; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v4 clamp 1138; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v5 clamp 1139; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v6 clamp 1140; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v7 clamp 1141; GFX8-NEXT: s_setpc_b64 s[30:31] 1142; 1143; GFX9-LABEL: v_usubsat_v4i32: 1144; GFX9: ; %bb.0: 1145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1146; GFX9-NEXT: v_sub_u32_e64 v0, v0, v4 clamp 1147; GFX9-NEXT: v_sub_u32_e64 v1, v1, v5 clamp 1148; GFX9-NEXT: v_sub_u32_e64 v2, v2, v6 clamp 1149; GFX9-NEXT: v_sub_u32_e64 v3, v3, v7 clamp 1150; GFX9-NEXT: s_setpc_b64 s[30:31] 1151; 1152; GFX10PLUS-LABEL: v_usubsat_v4i32: 1153; GFX10PLUS: ; %bb.0: 1154; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1155; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v4 clamp 1156; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v5 clamp 1157; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v6 clamp 1158; GFX10PLUS-NEXT: 
v_sub_nc_u32_e64 v3, v3, v7 clamp 1159; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1160 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1161 ret <4 x i32> %result 1162} 1163 1164define amdgpu_ps <4 x i32> @s_usubsat_v4i32(<4 x i32> inreg %lhs, <4 x i32> inreg %rhs) { 1165; GFX6-LABEL: s_usubsat_v4i32: 1166; GFX6: ; %bb.0: 1167; GFX6-NEXT: s_min_u32 s4, s0, s4 1168; GFX6-NEXT: s_sub_i32 s0, s0, s4 1169; GFX6-NEXT: s_min_u32 s4, s1, s5 1170; GFX6-NEXT: s_sub_i32 s1, s1, s4 1171; GFX6-NEXT: s_min_u32 s4, s2, s6 1172; GFX6-NEXT: s_sub_i32 s2, s2, s4 1173; GFX6-NEXT: s_min_u32 s4, s3, s7 1174; GFX6-NEXT: s_sub_i32 s3, s3, s4 1175; GFX6-NEXT: ; return to shader part epilog 1176; 1177; GFX8-LABEL: s_usubsat_v4i32: 1178; GFX8: ; %bb.0: 1179; GFX8-NEXT: v_mov_b32_e32 v0, s4 1180; GFX8-NEXT: v_mov_b32_e32 v1, s5 1181; GFX8-NEXT: v_mov_b32_e32 v2, s6 1182; GFX8-NEXT: v_mov_b32_e32 v3, s7 1183; GFX8-NEXT: v_sub_u32_e64 v0, s[8:9], s0, v0 clamp 1184; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp 1185; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp 1186; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp 1187; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1188; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1189; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1190; GFX8-NEXT: v_readfirstlane_b32 s3, v3 1191; GFX8-NEXT: ; return to shader part epilog 1192; 1193; GFX9-LABEL: s_usubsat_v4i32: 1194; GFX9: ; %bb.0: 1195; GFX9-NEXT: v_mov_b32_e32 v0, s4 1196; GFX9-NEXT: v_mov_b32_e32 v1, s5 1197; GFX9-NEXT: v_mov_b32_e32 v2, s6 1198; GFX9-NEXT: v_mov_b32_e32 v3, s7 1199; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 1200; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp 1201; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp 1202; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp 1203; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1204; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1205; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1206; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1207; GFX9-NEXT: ; return to shader part epilog 1208; 
1209; GFX10PLUS-LABEL: s_usubsat_v4i32: 1210; GFX10PLUS: ; %bb.0: 1211; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s4 clamp 1212; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s5 clamp 1213; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s6 clamp 1214; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s7 clamp 1215; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1216; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1217; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1218; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1219; GFX10PLUS-NEXT: ; return to shader part epilog 1220 %result = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) 1221 ret <4 x i32> %result 1222} 1223 1224define <5 x i32> @v_usubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) { 1225; GFX6-LABEL: v_usubsat_v5i32: 1226; GFX6: ; %bb.0: 1227; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1228; GFX6-NEXT: v_min_u32_e32 v5, v0, v5 1229; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 1230; GFX6-NEXT: v_min_u32_e32 v5, v1, v6 1231; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v5 1232; GFX6-NEXT: v_min_u32_e32 v5, v2, v7 1233; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v5 1234; GFX6-NEXT: v_min_u32_e32 v5, v3, v8 1235; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v5 1236; GFX6-NEXT: v_min_u32_e32 v5, v4, v9 1237; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v5 1238; GFX6-NEXT: s_setpc_b64 s[30:31] 1239; 1240; GFX8-LABEL: v_usubsat_v5i32: 1241; GFX8: ; %bb.0: 1242; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1243; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v5 clamp 1244; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v6 clamp 1245; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v7 clamp 1246; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v8 clamp 1247; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v9 clamp 1248; GFX8-NEXT: s_setpc_b64 s[30:31] 1249; 1250; GFX9-LABEL: v_usubsat_v5i32: 1251; GFX9: ; %bb.0: 1252; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1253; GFX9-NEXT: v_sub_u32_e64 v0, v0, v5 clamp 1254; GFX9-NEXT: v_sub_u32_e64 v1, v1, v6 clamp 1255; GFX9-NEXT: 
v_sub_u32_e64 v2, v2, v7 clamp 1256; GFX9-NEXT: v_sub_u32_e64 v3, v3, v8 clamp 1257; GFX9-NEXT: v_sub_u32_e64 v4, v4, v9 clamp 1258; GFX9-NEXT: s_setpc_b64 s[30:31] 1259; 1260; GFX10PLUS-LABEL: v_usubsat_v5i32: 1261; GFX10PLUS: ; %bb.0: 1262; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1263; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, v0, v5 clamp 1264; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, v1, v6 clamp 1265; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, v2, v7 clamp 1266; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, v3, v8 clamp 1267; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, v4, v9 clamp 1268; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1269 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1270 ret <5 x i32> %result 1271} 1272 1273define amdgpu_ps <5 x i32> @s_usubsat_v5i32(<5 x i32> inreg %lhs, <5 x i32> inreg %rhs) { 1274; GFX6-LABEL: s_usubsat_v5i32: 1275; GFX6: ; %bb.0: 1276; GFX6-NEXT: s_min_u32 s5, s0, s5 1277; GFX6-NEXT: s_sub_i32 s0, s0, s5 1278; GFX6-NEXT: s_min_u32 s5, s1, s6 1279; GFX6-NEXT: s_sub_i32 s1, s1, s5 1280; GFX6-NEXT: s_min_u32 s5, s2, s7 1281; GFX6-NEXT: s_sub_i32 s2, s2, s5 1282; GFX6-NEXT: s_min_u32 s5, s3, s8 1283; GFX6-NEXT: s_sub_i32 s3, s3, s5 1284; GFX6-NEXT: s_min_u32 s5, s4, s9 1285; GFX6-NEXT: s_sub_i32 s4, s4, s5 1286; GFX6-NEXT: ; return to shader part epilog 1287; 1288; GFX8-LABEL: s_usubsat_v5i32: 1289; GFX8: ; %bb.0: 1290; GFX8-NEXT: v_mov_b32_e32 v0, s5 1291; GFX8-NEXT: v_mov_b32_e32 v1, s6 1292; GFX8-NEXT: v_mov_b32_e32 v2, s7 1293; GFX8-NEXT: v_mov_b32_e32 v3, s8 1294; GFX8-NEXT: v_mov_b32_e32 v4, s9 1295; GFX8-NEXT: v_sub_u32_e64 v0, s[10:11], s0, v0 clamp 1296; GFX8-NEXT: v_sub_u32_e64 v1, s[0:1], s1, v1 clamp 1297; GFX8-NEXT: v_sub_u32_e64 v2, s[0:1], s2, v2 clamp 1298; GFX8-NEXT: v_sub_u32_e64 v3, s[0:1], s3, v3 clamp 1299; GFX8-NEXT: v_sub_u32_e64 v4, s[0:1], s4, v4 clamp 1300; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1301; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1302; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1303; 
GFX8-NEXT: v_readfirstlane_b32 s3, v3 1304; GFX8-NEXT: v_readfirstlane_b32 s4, v4 1305; GFX8-NEXT: ; return to shader part epilog 1306; 1307; GFX9-LABEL: s_usubsat_v5i32: 1308; GFX9: ; %bb.0: 1309; GFX9-NEXT: v_mov_b32_e32 v0, s5 1310; GFX9-NEXT: v_mov_b32_e32 v1, s6 1311; GFX9-NEXT: v_mov_b32_e32 v2, s7 1312; GFX9-NEXT: v_mov_b32_e32 v3, s8 1313; GFX9-NEXT: v_mov_b32_e32 v4, s9 1314; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 1315; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp 1316; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp 1317; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp 1318; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp 1319; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1320; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1321; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1322; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1323; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1324; GFX9-NEXT: ; return to shader part epilog 1325; 1326; GFX10PLUS-LABEL: s_usubsat_v5i32: 1327; GFX10PLUS: ; %bb.0: 1328; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s5 clamp 1329; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s6 clamp 1330; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s7 clamp 1331; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s8 clamp 1332; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s9 clamp 1333; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1334; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1335; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1336; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1337; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 1338; GFX10PLUS-NEXT: ; return to shader part epilog 1339 %result = call <5 x i32> @llvm.usub.sat.v5i32(<5 x i32> %lhs, <5 x i32> %rhs) 1340 ret <5 x i32> %result 1341} 1342 1343define <16 x i32> @v_usubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { 1344; GFX6-LABEL: v_usubsat_v16i32: 1345; GFX6: ; %bb.0: 1346; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1347; GFX6-NEXT: v_min_u32_e32 v16, v0, v16 1348; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v16 1349; GFX6-NEXT: v_min_u32_e32 v16, v1, v17 1350; 
GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v16 1351; GFX6-NEXT: v_min_u32_e32 v16, v2, v18 1352; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v16 1353; GFX6-NEXT: v_min_u32_e32 v16, v3, v19 1354; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v16 1355; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 1356; GFX6-NEXT: v_min_u32_e32 v17, v4, v20 1357; GFX6-NEXT: v_min_u32_e32 v18, v5, v21 1358; GFX6-NEXT: v_min_u32_e32 v19, v6, v22 1359; GFX6-NEXT: v_min_u32_e32 v20, v7, v23 1360; GFX6-NEXT: v_min_u32_e32 v21, v8, v24 1361; GFX6-NEXT: v_min_u32_e32 v22, v9, v25 1362; GFX6-NEXT: v_min_u32_e32 v23, v10, v26 1363; GFX6-NEXT: v_min_u32_e32 v24, v11, v27 1364; GFX6-NEXT: v_min_u32_e32 v25, v12, v28 1365; GFX6-NEXT: v_min_u32_e32 v26, v13, v29 1366; GFX6-NEXT: v_min_u32_e32 v27, v14, v30 1367; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v17 1368; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v18 1369; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v19 1370; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v20 1371; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v8, v21 1372; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v9, v22 1373; GFX6-NEXT: v_sub_i32_e32 v10, vcc, v10, v23 1374; GFX6-NEXT: v_sub_i32_e32 v11, vcc, v11, v24 1375; GFX6-NEXT: v_sub_i32_e32 v12, vcc, v12, v25 1376; GFX6-NEXT: v_sub_i32_e32 v13, vcc, v13, v26 1377; GFX6-NEXT: v_sub_i32_e32 v14, vcc, v14, v27 1378; GFX6-NEXT: s_waitcnt vmcnt(0) 1379; GFX6-NEXT: v_min_u32_e32 v16, v15, v16 1380; GFX6-NEXT: v_sub_i32_e32 v15, vcc, v15, v16 1381; GFX6-NEXT: s_setpc_b64 s[30:31] 1382; 1383; GFX8-LABEL: v_usubsat_v16i32: 1384; GFX8: ; %bb.0: 1385; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1386; GFX8-NEXT: v_sub_u32_e64 v0, s[4:5], v0, v16 clamp 1387; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 1388; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v1, v17 clamp 1389; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v2, v18 clamp 1390; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v3, v19 clamp 1391; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v4, v20 clamp 1392; GFX8-NEXT: v_sub_u32_e64 v5, s[4:5], v5, v21 clamp 
1393; GFX8-NEXT: v_sub_u32_e64 v6, s[4:5], v6, v22 clamp 1394; GFX8-NEXT: v_sub_u32_e64 v7, s[4:5], v7, v23 clamp 1395; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v8, v24 clamp 1396; GFX8-NEXT: v_sub_u32_e64 v9, s[4:5], v9, v25 clamp 1397; GFX8-NEXT: v_sub_u32_e64 v10, s[4:5], v10, v26 clamp 1398; GFX8-NEXT: v_sub_u32_e64 v11, s[4:5], v11, v27 clamp 1399; GFX8-NEXT: v_sub_u32_e64 v12, s[4:5], v12, v28 clamp 1400; GFX8-NEXT: v_sub_u32_e64 v13, s[4:5], v13, v29 clamp 1401; GFX8-NEXT: v_sub_u32_e64 v14, s[4:5], v14, v30 clamp 1402; GFX8-NEXT: s_waitcnt vmcnt(0) 1403; GFX8-NEXT: v_sub_u32_e64 v15, s[4:5], v15, v16 clamp 1404; GFX8-NEXT: s_setpc_b64 s[30:31] 1405; 1406; GFX9-LABEL: v_usubsat_v16i32: 1407; GFX9: ; %bb.0: 1408; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1409; GFX9-NEXT: v_sub_u32_e64 v0, v0, v16 clamp 1410; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 1411; GFX9-NEXT: v_sub_u32_e64 v1, v1, v17 clamp 1412; GFX9-NEXT: v_sub_u32_e64 v2, v2, v18 clamp 1413; GFX9-NEXT: v_sub_u32_e64 v3, v3, v19 clamp 1414; GFX9-NEXT: v_sub_u32_e64 v4, v4, v20 clamp 1415; GFX9-NEXT: v_sub_u32_e64 v5, v5, v21 clamp 1416; GFX9-NEXT: v_sub_u32_e64 v6, v6, v22 clamp 1417; GFX9-NEXT: v_sub_u32_e64 v7, v7, v23 clamp 1418; GFX9-NEXT: v_sub_u32_e64 v8, v8, v24 clamp 1419; GFX9-NEXT: v_sub_u32_e64 v9, v9, v25 clamp 1420; GFX9-NEXT: v_sub_u32_e64 v10, v10, v26 clamp 1421; GFX9-NEXT: v_sub_u32_e64 v11, v11, v27 clamp 1422; GFX9-NEXT: v_sub_u32_e64 v12, v12, v28 clamp 1423; GFX9-NEXT: v_sub_u32_e64 v13, v13, v29 clamp 1424; GFX9-NEXT: v_sub_u32_e64 v14, v14, v30 clamp 1425; GFX9-NEXT: s_waitcnt vmcnt(0) 1426; GFX9-NEXT: v_sub_u32_e64 v15, v15, v16 clamp 1427; GFX9-NEXT: s_setpc_b64 s[30:31] 1428; 1429; GFX10-LABEL: v_usubsat_v16i32: 1430; GFX10: ; %bb.0: 1431; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1432; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 1433; GFX10-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp 1434; GFX10-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp 1435; 
GFX10-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp 1436; GFX10-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp 1437; GFX10-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp 1438; GFX10-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp 1439; GFX10-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp 1440; GFX10-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp 1441; GFX10-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp 1442; GFX10-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp 1443; GFX10-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp 1444; GFX10-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp 1445; GFX10-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp 1446; GFX10-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp 1447; GFX10-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp 1448; GFX10-NEXT: s_waitcnt vmcnt(0) 1449; GFX10-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp 1450; GFX10-NEXT: s_setpc_b64 s[30:31] 1451; 1452; GFX11-LABEL: v_usubsat_v16i32: 1453; GFX11: ; %bb.0: 1454; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1455; GFX11-NEXT: scratch_load_b32 v31, off, s32 1456; GFX11-NEXT: v_sub_nc_u32_e64 v0, v0, v16 clamp 1457; GFX11-NEXT: v_sub_nc_u32_e64 v1, v1, v17 clamp 1458; GFX11-NEXT: v_sub_nc_u32_e64 v2, v2, v18 clamp 1459; GFX11-NEXT: v_sub_nc_u32_e64 v3, v3, v19 clamp 1460; GFX11-NEXT: v_sub_nc_u32_e64 v4, v4, v20 clamp 1461; GFX11-NEXT: v_sub_nc_u32_e64 v5, v5, v21 clamp 1462; GFX11-NEXT: v_sub_nc_u32_e64 v6, v6, v22 clamp 1463; GFX11-NEXT: v_sub_nc_u32_e64 v7, v7, v23 clamp 1464; GFX11-NEXT: v_sub_nc_u32_e64 v8, v8, v24 clamp 1465; GFX11-NEXT: v_sub_nc_u32_e64 v9, v9, v25 clamp 1466; GFX11-NEXT: v_sub_nc_u32_e64 v10, v10, v26 clamp 1467; GFX11-NEXT: v_sub_nc_u32_e64 v11, v11, v27 clamp 1468; GFX11-NEXT: v_sub_nc_u32_e64 v12, v12, v28 clamp 1469; GFX11-NEXT: v_sub_nc_u32_e64 v13, v13, v29 clamp 1470; GFX11-NEXT: v_sub_nc_u32_e64 v14, v14, v30 clamp 1471; GFX11-NEXT: s_waitcnt vmcnt(0) 1472; GFX11-NEXT: v_sub_nc_u32_e64 v15, v15, v31 clamp 1473; GFX11-NEXT: s_setpc_b64 s[30:31] 1474 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> 
%lhs, <16 x i32> %rhs) 1475 ret <16 x i32> %result 1476} 1477 1478define amdgpu_ps <16 x i32> @s_usubsat_v16i32(<16 x i32> inreg %lhs, <16 x i32> inreg %rhs) { 1479; GFX6-LABEL: s_usubsat_v16i32: 1480; GFX6: ; %bb.0: 1481; GFX6-NEXT: s_min_u32 s16, s0, s16 1482; GFX6-NEXT: s_sub_i32 s0, s0, s16 1483; GFX6-NEXT: s_min_u32 s16, s1, s17 1484; GFX6-NEXT: s_sub_i32 s1, s1, s16 1485; GFX6-NEXT: s_min_u32 s16, s2, s18 1486; GFX6-NEXT: s_sub_i32 s2, s2, s16 1487; GFX6-NEXT: s_min_u32 s16, s3, s19 1488; GFX6-NEXT: s_sub_i32 s3, s3, s16 1489; GFX6-NEXT: s_min_u32 s16, s4, s20 1490; GFX6-NEXT: s_sub_i32 s4, s4, s16 1491; GFX6-NEXT: s_min_u32 s16, s5, s21 1492; GFX6-NEXT: s_sub_i32 s5, s5, s16 1493; GFX6-NEXT: s_min_u32 s16, s6, s22 1494; GFX6-NEXT: s_sub_i32 s6, s6, s16 1495; GFX6-NEXT: s_min_u32 s16, s7, s23 1496; GFX6-NEXT: s_sub_i32 s7, s7, s16 1497; GFX6-NEXT: s_min_u32 s16, s8, s24 1498; GFX6-NEXT: s_sub_i32 s8, s8, s16 1499; GFX6-NEXT: s_min_u32 s16, s9, s25 1500; GFX6-NEXT: s_sub_i32 s9, s9, s16 1501; GFX6-NEXT: s_min_u32 s16, s10, s26 1502; GFX6-NEXT: s_sub_i32 s10, s10, s16 1503; GFX6-NEXT: s_min_u32 s16, s11, s27 1504; GFX6-NEXT: s_sub_i32 s11, s11, s16 1505; GFX6-NEXT: s_min_u32 s16, s12, s28 1506; GFX6-NEXT: s_sub_i32 s12, s12, s16 1507; GFX6-NEXT: s_min_u32 s16, s13, s29 1508; GFX6-NEXT: s_sub_i32 s13, s13, s16 1509; GFX6-NEXT: s_min_u32 s16, s14, s30 1510; GFX6-NEXT: s_sub_i32 s14, s14, s16 1511; GFX6-NEXT: s_min_u32 s16, s15, s31 1512; GFX6-NEXT: s_sub_i32 s15, s15, s16 1513; GFX6-NEXT: ; return to shader part epilog 1514; 1515; GFX8-LABEL: s_usubsat_v16i32: 1516; GFX8: ; %bb.0: 1517; GFX8-NEXT: v_mov_b32_e32 v0, s16 1518; GFX8-NEXT: v_mov_b32_e32 v1, s17 1519; GFX8-NEXT: v_mov_b32_e32 v2, s18 1520; GFX8-NEXT: v_mov_b32_e32 v3, s19 1521; GFX8-NEXT: v_mov_b32_e32 v4, s20 1522; GFX8-NEXT: v_mov_b32_e32 v5, s21 1523; GFX8-NEXT: v_mov_b32_e32 v6, s22 1524; GFX8-NEXT: v_mov_b32_e32 v7, s23 1525; GFX8-NEXT: v_mov_b32_e32 v8, s24 1526; GFX8-NEXT: v_mov_b32_e32 v9, s25 
1527; GFX8-NEXT: v_mov_b32_e32 v10, s26 1528; GFX8-NEXT: v_mov_b32_e32 v11, s27 1529; GFX8-NEXT: v_mov_b32_e32 v12, s28 1530; GFX8-NEXT: v_mov_b32_e32 v13, s29 1531; GFX8-NEXT: v_mov_b32_e32 v14, s30 1532; GFX8-NEXT: v_mov_b32_e32 v15, s31 1533; GFX8-NEXT: v_sub_u32_e64 v0, s[32:33], s0, v0 clamp 1534; GFX8-NEXT: v_sub_u32_e64 v1, s[16:17], s1, v1 clamp 1535; GFX8-NEXT: v_sub_u32_e64 v2, s[16:17], s2, v2 clamp 1536; GFX8-NEXT: v_sub_u32_e64 v3, s[2:3], s3, v3 clamp 1537; GFX8-NEXT: v_sub_u32_e64 v4, s[2:3], s4, v4 clamp 1538; GFX8-NEXT: v_sub_u32_e64 v5, s[2:3], s5, v5 clamp 1539; GFX8-NEXT: v_sub_u32_e64 v6, s[2:3], s6, v6 clamp 1540; GFX8-NEXT: v_sub_u32_e64 v7, s[2:3], s7, v7 clamp 1541; GFX8-NEXT: v_sub_u32_e64 v8, s[2:3], s8, v8 clamp 1542; GFX8-NEXT: v_sub_u32_e64 v9, s[2:3], s9, v9 clamp 1543; GFX8-NEXT: v_sub_u32_e64 v10, s[2:3], s10, v10 clamp 1544; GFX8-NEXT: v_sub_u32_e64 v11, s[2:3], s11, v11 clamp 1545; GFX8-NEXT: v_sub_u32_e64 v12, s[2:3], s12, v12 clamp 1546; GFX8-NEXT: v_sub_u32_e64 v13, s[2:3], s13, v13 clamp 1547; GFX8-NEXT: v_sub_u32_e64 v14, s[2:3], s14, v14 clamp 1548; GFX8-NEXT: v_sub_u32_e64 v15, s[2:3], s15, v15 clamp 1549; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1550; GFX8-NEXT: v_readfirstlane_b32 s1, v1 1551; GFX8-NEXT: v_readfirstlane_b32 s2, v2 1552; GFX8-NEXT: v_readfirstlane_b32 s3, v3 1553; GFX8-NEXT: v_readfirstlane_b32 s4, v4 1554; GFX8-NEXT: v_readfirstlane_b32 s5, v5 1555; GFX8-NEXT: v_readfirstlane_b32 s6, v6 1556; GFX8-NEXT: v_readfirstlane_b32 s7, v7 1557; GFX8-NEXT: v_readfirstlane_b32 s8, v8 1558; GFX8-NEXT: v_readfirstlane_b32 s9, v9 1559; GFX8-NEXT: v_readfirstlane_b32 s10, v10 1560; GFX8-NEXT: v_readfirstlane_b32 s11, v11 1561; GFX8-NEXT: v_readfirstlane_b32 s12, v12 1562; GFX8-NEXT: v_readfirstlane_b32 s13, v13 1563; GFX8-NEXT: v_readfirstlane_b32 s14, v14 1564; GFX8-NEXT: v_readfirstlane_b32 s15, v15 1565; GFX8-NEXT: ; return to shader part epilog 1566; 1567; GFX9-LABEL: s_usubsat_v16i32: 1568; GFX9: ; %bb.0: 1569; 
GFX9-NEXT: v_mov_b32_e32 v0, s16 1570; GFX9-NEXT: v_mov_b32_e32 v1, s17 1571; GFX9-NEXT: v_mov_b32_e32 v2, s18 1572; GFX9-NEXT: v_mov_b32_e32 v3, s19 1573; GFX9-NEXT: v_mov_b32_e32 v4, s20 1574; GFX9-NEXT: v_mov_b32_e32 v5, s21 1575; GFX9-NEXT: v_mov_b32_e32 v6, s22 1576; GFX9-NEXT: v_mov_b32_e32 v7, s23 1577; GFX9-NEXT: v_mov_b32_e32 v8, s24 1578; GFX9-NEXT: v_mov_b32_e32 v9, s25 1579; GFX9-NEXT: v_mov_b32_e32 v10, s26 1580; GFX9-NEXT: v_mov_b32_e32 v11, s27 1581; GFX9-NEXT: v_mov_b32_e32 v12, s28 1582; GFX9-NEXT: v_mov_b32_e32 v13, s29 1583; GFX9-NEXT: v_mov_b32_e32 v14, s30 1584; GFX9-NEXT: v_mov_b32_e32 v15, s31 1585; GFX9-NEXT: v_sub_u32_e64 v0, s0, v0 clamp 1586; GFX9-NEXT: v_sub_u32_e64 v1, s1, v1 clamp 1587; GFX9-NEXT: v_sub_u32_e64 v2, s2, v2 clamp 1588; GFX9-NEXT: v_sub_u32_e64 v3, s3, v3 clamp 1589; GFX9-NEXT: v_sub_u32_e64 v4, s4, v4 clamp 1590; GFX9-NEXT: v_sub_u32_e64 v5, s5, v5 clamp 1591; GFX9-NEXT: v_sub_u32_e64 v6, s6, v6 clamp 1592; GFX9-NEXT: v_sub_u32_e64 v7, s7, v7 clamp 1593; GFX9-NEXT: v_sub_u32_e64 v8, s8, v8 clamp 1594; GFX9-NEXT: v_sub_u32_e64 v9, s9, v9 clamp 1595; GFX9-NEXT: v_sub_u32_e64 v10, s10, v10 clamp 1596; GFX9-NEXT: v_sub_u32_e64 v11, s11, v11 clamp 1597; GFX9-NEXT: v_sub_u32_e64 v12, s12, v12 clamp 1598; GFX9-NEXT: v_sub_u32_e64 v13, s13, v13 clamp 1599; GFX9-NEXT: v_sub_u32_e64 v14, s14, v14 clamp 1600; GFX9-NEXT: v_sub_u32_e64 v15, s15, v15 clamp 1601; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1602; GFX9-NEXT: v_readfirstlane_b32 s1, v1 1603; GFX9-NEXT: v_readfirstlane_b32 s2, v2 1604; GFX9-NEXT: v_readfirstlane_b32 s3, v3 1605; GFX9-NEXT: v_readfirstlane_b32 s4, v4 1606; GFX9-NEXT: v_readfirstlane_b32 s5, v5 1607; GFX9-NEXT: v_readfirstlane_b32 s6, v6 1608; GFX9-NEXT: v_readfirstlane_b32 s7, v7 1609; GFX9-NEXT: v_readfirstlane_b32 s8, v8 1610; GFX9-NEXT: v_readfirstlane_b32 s9, v9 1611; GFX9-NEXT: v_readfirstlane_b32 s10, v10 1612; GFX9-NEXT: v_readfirstlane_b32 s11, v11 1613; GFX9-NEXT: v_readfirstlane_b32 s12, v12 1614; 
GFX9-NEXT: v_readfirstlane_b32 s13, v13 1615; GFX9-NEXT: v_readfirstlane_b32 s14, v14 1616; GFX9-NEXT: v_readfirstlane_b32 s15, v15 1617; GFX9-NEXT: ; return to shader part epilog 1618; 1619; GFX10PLUS-LABEL: s_usubsat_v16i32: 1620; GFX10PLUS: ; %bb.0: 1621; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v0, s0, s16 clamp 1622; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v1, s1, s17 clamp 1623; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v2, s2, s18 clamp 1624; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v3, s3, s19 clamp 1625; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v4, s4, s20 clamp 1626; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v5, s5, s21 clamp 1627; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v6, s6, s22 clamp 1628; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v7, s7, s23 clamp 1629; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v8, s8, s24 clamp 1630; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v9, s9, s25 clamp 1631; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v10, s10, s26 clamp 1632; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v11, s11, s27 clamp 1633; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v12, s12, s28 clamp 1634; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v13, s13, s29 clamp 1635; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v14, s14, s30 clamp 1636; GFX10PLUS-NEXT: v_sub_nc_u32_e64 v15, s15, s31 clamp 1637; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1638; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 1639; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 1640; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 1641; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 1642; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 1643; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 1644; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 1645; GFX10PLUS-NEXT: v_readfirstlane_b32 s8, v8 1646; GFX10PLUS-NEXT: v_readfirstlane_b32 s9, v9 1647; GFX10PLUS-NEXT: v_readfirstlane_b32 s10, v10 1648; GFX10PLUS-NEXT: v_readfirstlane_b32 s11, v11 1649; GFX10PLUS-NEXT: v_readfirstlane_b32 s12, v12 1650; GFX10PLUS-NEXT: v_readfirstlane_b32 s13, v13 1651; GFX10PLUS-NEXT: v_readfirstlane_b32 s14, v14 1652; GFX10PLUS-NEXT: v_readfirstlane_b32 s15, v15 1653; GFX10PLUS-NEXT: ; 
return to shader part epilog 1654 %result = call <16 x i32> @llvm.usub.sat.v16i32(<16 x i32> %lhs, <16 x i32> %rhs) 1655 ret <16 x i32> %result 1656} 1657 1658define i16 @v_usubsat_i16(i16 %lhs, i16 %rhs) { 1659; GFX6-LABEL: v_usubsat_i16: 1660; GFX6: ; %bb.0: 1661; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1662; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1663; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1664; GFX6-NEXT: v_min_u32_e32 v1, v0, v1 1665; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 1666; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1667; GFX6-NEXT: s_setpc_b64 s[30:31] 1668; 1669; GFX8-LABEL: v_usubsat_i16: 1670; GFX8: ; %bb.0: 1671; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1672; GFX8-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 1673; GFX8-NEXT: s_setpc_b64 s[30:31] 1674; 1675; GFX9-LABEL: v_usubsat_i16: 1676; GFX9: ; %bb.0: 1677; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1678; GFX9-NEXT: v_sub_u16_e64 v0, v0, v1 clamp 1679; GFX9-NEXT: s_setpc_b64 s[30:31] 1680; 1681; GFX10PLUS-LABEL: v_usubsat_i16: 1682; GFX10PLUS: ; %bb.0: 1683; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1684; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, v1 clamp 1685; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1686 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) 1687 ret i16 %result 1688} 1689 1690define amdgpu_ps i16 @s_usubsat_i16(i16 inreg %lhs, i16 inreg %rhs) { 1691; GFX6-LABEL: s_usubsat_i16: 1692; GFX6: ; %bb.0: 1693; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1694; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1695; GFX6-NEXT: s_min_u32 s1, s0, s1 1696; GFX6-NEXT: s_sub_i32 s0, s0, s1 1697; GFX6-NEXT: s_lshr_b32 s0, s0, 16 1698; GFX6-NEXT: ; return to shader part epilog 1699; 1700; GFX8-LABEL: s_usubsat_i16: 1701; GFX8: ; %bb.0: 1702; GFX8-NEXT: v_mov_b32_e32 v0, s1 1703; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 1704; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1705; GFX8-NEXT: ; return to shader part epilog 1706; 1707; GFX9-LABEL: s_usubsat_i16: 1708; GFX9: ; %bb.0: 1709; GFX9-NEXT: 
v_mov_b32_e32 v0, s1 1710; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 1711; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1712; GFX9-NEXT: ; return to shader part epilog 1713; 1714; GFX10PLUS-LABEL: s_usubsat_i16: 1715; GFX10PLUS: ; %bb.0: 1716; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, s1 clamp 1717; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1718; GFX10PLUS-NEXT: ; return to shader part epilog 1719 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) 1720 ret i16 %result 1721} 1722 1723define amdgpu_ps half @usubsat_i16_sv(i16 inreg %lhs, i16 %rhs) { 1724; GFX6-LABEL: usubsat_i16_sv: 1725; GFX6: ; %bb.0: 1726; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1727; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1728; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 1729; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1730; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1731; GFX6-NEXT: ; return to shader part epilog 1732; 1733; GFX8-LABEL: usubsat_i16_sv: 1734; GFX8: ; %bb.0: 1735; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 1736; GFX8-NEXT: ; return to shader part epilog 1737; 1738; GFX9-LABEL: usubsat_i16_sv: 1739; GFX9: ; %bb.0: 1740; GFX9-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 1741; GFX9-NEXT: ; return to shader part epilog 1742; 1743; GFX10PLUS-LABEL: usubsat_i16_sv: 1744; GFX10PLUS: ; %bb.0: 1745; GFX10PLUS-NEXT: v_sub_nc_u16 v0, s0, v0 clamp 1746; GFX10PLUS-NEXT: ; return to shader part epilog 1747 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) 1748 %cast = bitcast i16 %result to half 1749 ret half %cast 1750} 1751 1752define amdgpu_ps half @usubsat_i16_vs(i16 %lhs, i16 inreg %rhs) { 1753; GFX6-LABEL: usubsat_i16_vs: 1754; GFX6: ; %bb.0: 1755; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1756; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1757; GFX6-NEXT: v_min_u32_e32 v1, s0, v0 1758; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v1 1759; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1760; GFX6-NEXT: ; return to shader part epilog 1761; 1762; GFX8-LABEL: usubsat_i16_vs: 1763; GFX8: ; %bb.0: 1764; GFX8-NEXT: v_sub_u16_e64 v0, v0, s0 clamp 
1765; GFX8-NEXT: ; return to shader part epilog 1766; 1767; GFX9-LABEL: usubsat_i16_vs: 1768; GFX9: ; %bb.0: 1769; GFX9-NEXT: v_sub_u16_e64 v0, v0, s0 clamp 1770; GFX9-NEXT: ; return to shader part epilog 1771; 1772; GFX10PLUS-LABEL: usubsat_i16_vs: 1773; GFX10PLUS: ; %bb.0: 1774; GFX10PLUS-NEXT: v_sub_nc_u16 v0, v0, s0 clamp 1775; GFX10PLUS-NEXT: ; return to shader part epilog 1776 %result = call i16 @llvm.usub.sat.i16(i16 %lhs, i16 %rhs) 1777 %cast = bitcast i16 %result to half 1778 ret half %cast 1779} 1780 1781define <2 x i16> @v_usubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { 1782; GFX6-LABEL: v_usubsat_v2i16: 1783; GFX6: ; %bb.0: 1784; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1785; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1786; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1787; GFX6-NEXT: v_min_u32_e32 v2, v0, v2 1788; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 1789; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1790; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 1791; GFX6-NEXT: v_min_u32_e32 v2, v1, v2 1792; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 1793; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0 1794; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1795; GFX6-NEXT: s_setpc_b64 s[30:31] 1796; 1797; GFX8-LABEL: v_usubsat_v2i16: 1798; GFX8: ; %bb.0: 1799; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1800; GFX8-NEXT: v_sub_u16_e64 v2, v0, v1 clamp 1801; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1802; GFX8-NEXT: v_or_b32_e32 v0, v2, v0 1803; GFX8-NEXT: s_setpc_b64 s[30:31] 1804; 1805; GFX9-LABEL: v_usubsat_v2i16: 1806; GFX9: ; %bb.0: 1807; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1808; GFX9-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 1809; GFX9-NEXT: s_setpc_b64 s[30:31] 1810; 1811; GFX10PLUS-LABEL: v_usubsat_v2i16: 1812; GFX10PLUS: ; %bb.0: 1813; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1814; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v1 clamp 1815; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 1816 
%result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1817 ret <2 x i16> %result 1818} 1819 1820define amdgpu_ps i32 @s_usubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs) { 1821; GFX6-LABEL: s_usubsat_v2i16: 1822; GFX6: ; %bb.0: 1823; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1824; GFX6-NEXT: s_lshl_b32 s2, s2, 16 1825; GFX6-NEXT: s_min_u32 s2, s0, s2 1826; GFX6-NEXT: s_sub_i32 s0, s0, s2 1827; GFX6-NEXT: s_lshl_b32 s1, s1, 16 1828; GFX6-NEXT: s_lshl_b32 s2, s3, 16 1829; GFX6-NEXT: s_min_u32 s2, s1, s2 1830; GFX6-NEXT: s_sub_i32 s1, s1, s2 1831; GFX6-NEXT: s_lshr_b32 s1, s1, 16 1832; GFX6-NEXT: v_mov_b32_e32 v0, s0 1833; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 1834; GFX6-NEXT: v_readfirstlane_b32 s0, v0 1835; GFX6-NEXT: ; return to shader part epilog 1836; 1837; GFX8-LABEL: s_usubsat_v2i16: 1838; GFX8: ; %bb.0: 1839; GFX8-NEXT: s_lshr_b32 s2, s0, 16 1840; GFX8-NEXT: s_lshr_b32 s3, s1, 16 1841; GFX8-NEXT: v_mov_b32_e32 v0, s1 1842; GFX8-NEXT: v_mov_b32_e32 v1, s3 1843; GFX8-NEXT: v_mov_b32_e32 v2, s2 1844; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 1845; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 1846; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 1847; GFX8-NEXT: v_readfirstlane_b32 s0, v0 1848; GFX8-NEXT: ; return to shader part epilog 1849; 1850; GFX9-LABEL: s_usubsat_v2i16: 1851; GFX9: ; %bb.0: 1852; GFX9-NEXT: v_mov_b32_e32 v0, s1 1853; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 1854; GFX9-NEXT: v_readfirstlane_b32 s0, v0 1855; GFX9-NEXT: ; return to shader part epilog 1856; 1857; GFX10PLUS-LABEL: s_usubsat_v2i16: 1858; GFX10PLUS: ; %bb.0: 1859; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s1 clamp 1860; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 1861; GFX10PLUS-NEXT: ; return to shader part epilog 1862 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1863 %cast = bitcast <2 x i16> %result to i32 1864 ret i32 %cast 1865} 1866 1867define amdgpu_ps float 
@usubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) { 1868; GFX6-LABEL: usubsat_v2i16_sv: 1869; GFX6: ; %bb.0: 1870; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1871; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1872; GFX6-NEXT: v_min_u32_e32 v0, s0, v0 1873; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 1874; GFX6-NEXT: s_lshl_b32 s0, s1, 16 1875; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1876; GFX6-NEXT: v_min_u32_e32 v1, s0, v1 1877; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1 1878; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1879; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 1880; GFX6-NEXT: ; return to shader part epilog 1881; 1882; GFX8-LABEL: usubsat_v2i16_sv: 1883; GFX8: ; %bb.0: 1884; GFX8-NEXT: s_lshr_b32 s1, s0, 16 1885; GFX8-NEXT: v_mov_b32_e32 v2, s1 1886; GFX8-NEXT: v_sub_u16_e64 v1, s0, v0 clamp 1887; GFX8-NEXT: v_sub_u16_sdwa v0, v2, v0 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 1888; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 1889; GFX8-NEXT: ; return to shader part epilog 1890; 1891; GFX9-LABEL: usubsat_v2i16_sv: 1892; GFX9: ; %bb.0: 1893; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 1894; GFX9-NEXT: ; return to shader part epilog 1895; 1896; GFX10PLUS-LABEL: usubsat_v2i16_sv: 1897; GFX10PLUS: ; %bb.0: 1898; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 1899; GFX10PLUS-NEXT: ; return to shader part epilog 1900 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1901 %cast = bitcast <2 x i16> %result to float 1902 ret float %cast 1903} 1904 1905define amdgpu_ps float @usubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) { 1906; GFX6-LABEL: usubsat_v2i16_vs: 1907; GFX6: ; %bb.0: 1908; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1909; GFX6-NEXT: s_lshl_b32 s0, s0, 16 1910; GFX6-NEXT: v_min_u32_e32 v2, s0, v0 1911; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1912; GFX6-NEXT: s_lshl_b32 s0, s1, 16 1913; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 1914; GFX6-NEXT: v_min_u32_e32 v2, s0, v1 1915; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v2 1916; 
GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1917; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 1918; GFX6-NEXT: ; return to shader part epilog 1919; 1920; GFX8-LABEL: usubsat_v2i16_vs: 1921; GFX8: ; %bb.0: 1922; GFX8-NEXT: s_lshr_b32 s1, s0, 16 1923; GFX8-NEXT: v_mov_b32_e32 v2, s1 1924; GFX8-NEXT: v_sub_u16_e64 v1, v0, s0 clamp 1925; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD 1926; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 1927; GFX8-NEXT: ; return to shader part epilog 1928; 1929; GFX9-LABEL: usubsat_v2i16_vs: 1930; GFX9: ; %bb.0: 1931; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 clamp 1932; GFX9-NEXT: ; return to shader part epilog 1933; 1934; GFX10PLUS-LABEL: usubsat_v2i16_vs: 1935; GFX10PLUS: ; %bb.0: 1936; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, s0 clamp 1937; GFX10PLUS-NEXT: ; return to shader part epilog 1938 %result = call <2 x i16> @llvm.usub.sat.v2i16(<2 x i16> %lhs, <2 x i16> %rhs) 1939 %cast = bitcast <2 x i16> %result to float 1940 ret float %cast 1941} 1942 1943; FIXME: v3i16 insert/extract 1944; define <3 x i16> @v_usubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { 1945; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 1946; ret <3 x i16> %result 1947; } 1948 1949; define amdgpu_ps <3 x i16> @s_usubsat_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs) { 1950; %result = call <3 x i16> @llvm.usub.sat.v3i16(<3 x i16> %lhs, <3 x i16> %rhs) 1951; ret <3 x i16> %result 1952; } 1953 1954define <2 x float> @v_usubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { 1955; GFX6-LABEL: v_usubsat_v4i16: 1956; GFX6: ; %bb.0: 1957; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1958; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 1959; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 1960; GFX6-NEXT: v_min_u32_e32 v4, v0, v4 1961; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 1962; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 1963; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5 1964; GFX6-NEXT: v_min_u32_e32 v4, v1, v4 1965; 
GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 1966; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 1967; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v6 1968; GFX6-NEXT: v_min_u32_e32 v4, v2, v4 1969; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v4 1970; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 1971; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v7 1972; GFX6-NEXT: v_min_u32_e32 v4, v3, v4 1973; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v4 1974; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 1975; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 1976; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 1977; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 1978; GFX6-NEXT: s_setpc_b64 s[30:31] 1979; 1980; GFX8-LABEL: v_usubsat_v4i16: 1981; GFX8: ; %bb.0: 1982; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1983; GFX8-NEXT: v_sub_u16_e64 v4, v0, v2 clamp 1984; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v2 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1985; GFX8-NEXT: v_sub_u16_e64 v2, v1, v3 clamp 1986; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 1987; GFX8-NEXT: v_or_b32_e32 v0, v4, v0 1988; GFX8-NEXT: v_or_b32_e32 v1, v2, v1 1989; GFX8-NEXT: s_setpc_b64 s[30:31] 1990; 1991; GFX9-LABEL: v_usubsat_v4i16: 1992; GFX9: ; %bb.0: 1993; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 1994; GFX9-NEXT: v_pk_sub_u16 v0, v0, v2 clamp 1995; GFX9-NEXT: v_pk_sub_u16 v1, v1, v3 clamp 1996; GFX9-NEXT: s_setpc_b64 s[30:31] 1997; 1998; GFX10PLUS-LABEL: v_usubsat_v4i16: 1999; GFX10PLUS: ; %bb.0: 2000; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2001; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v2 clamp 2002; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v3 clamp 2003; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2004 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 2005 %cast = bitcast <4 x i16> %result to <2 x float> 2006 ret <2 x float> %cast 2007} 2008 2009define amdgpu_ps <2 x i32> @s_usubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inreg %rhs) { 
2010; GFX6-LABEL: s_usubsat_v4i16: 2011; GFX6: ; %bb.0: 2012; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2013; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2014; GFX6-NEXT: s_min_u32 s4, s0, s4 2015; GFX6-NEXT: s_sub_i32 s0, s0, s4 2016; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2017; GFX6-NEXT: s_lshl_b32 s4, s5, 16 2018; GFX6-NEXT: s_min_u32 s4, s1, s4 2019; GFX6-NEXT: s_sub_i32 s1, s1, s4 2020; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2021; GFX6-NEXT: s_lshl_b32 s4, s6, 16 2022; GFX6-NEXT: s_min_u32 s4, s2, s4 2023; GFX6-NEXT: s_sub_i32 s2, s2, s4 2024; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2025; GFX6-NEXT: s_lshl_b32 s4, s7, 16 2026; GFX6-NEXT: s_min_u32 s4, s3, s4 2027; GFX6-NEXT: s_sub_i32 s3, s3, s4 2028; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2029; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2030; GFX6-NEXT: v_mov_b32_e32 v0, s0 2031; GFX6-NEXT: v_mov_b32_e32 v1, s2 2032; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 2033; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 2034; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2035; GFX6-NEXT: v_readfirstlane_b32 s1, v1 2036; GFX6-NEXT: ; return to shader part epilog 2037; 2038; GFX8-LABEL: s_usubsat_v4i16: 2039; GFX8: ; %bb.0: 2040; GFX8-NEXT: s_lshr_b32 s4, s0, 16 2041; GFX8-NEXT: s_lshr_b32 s6, s2, 16 2042; GFX8-NEXT: s_lshr_b32 s5, s1, 16 2043; GFX8-NEXT: s_lshr_b32 s7, s3, 16 2044; GFX8-NEXT: v_mov_b32_e32 v1, s6 2045; GFX8-NEXT: v_mov_b32_e32 v2, s4 2046; GFX8-NEXT: v_mov_b32_e32 v0, s2 2047; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2048; GFX8-NEXT: v_mov_b32_e32 v2, s3 2049; GFX8-NEXT: v_mov_b32_e32 v3, s7 2050; GFX8-NEXT: v_mov_b32_e32 v4, s5 2051; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 2052; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp 2053; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2054; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 2055; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 2056; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2057; GFX8-NEXT: v_readfirstlane_b32 s1, v1 
2058; GFX8-NEXT: ; return to shader part epilog 2059; 2060; GFX9-LABEL: s_usubsat_v4i16: 2061; GFX9: ; %bb.0: 2062; GFX9-NEXT: v_mov_b32_e32 v0, s2 2063; GFX9-NEXT: v_mov_b32_e32 v1, s3 2064; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 2065; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp 2066; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2067; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2068; GFX9-NEXT: ; return to shader part epilog 2069; 2070; GFX10PLUS-LABEL: s_usubsat_v4i16: 2071; GFX10PLUS: ; %bb.0: 2072; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s2 clamp 2073; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s3 clamp 2074; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2075; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 2076; GFX10PLUS-NEXT: ; return to shader part epilog 2077 %result = call <4 x i16> @llvm.usub.sat.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) 2078 %cast = bitcast <4 x i16> %result to <2 x i32> 2079 ret <2 x i32> %cast 2080} 2081 2082; FIXME 2083; define <5 x i16> @v_usubsat_v5i16(<5 x i16> %lhs, <5 x i16> %rhs) { 2084; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 2085; ret <5 x i16> %result 2086; } 2087 2088; define amdgpu_ps <5 x i16> @s_usubsat_v5i16(<5 x i16> inreg %lhs, <5 x i16> inreg %rhs) { 2089; %result = call <5 x i16> @llvm.usub.sat.v5i16(<5 x i16> %lhs, <5 x i16> %rhs) 2090; ret <5 x i16> %result 2091; } 2092 2093define <3 x float> @v_usubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) { 2094; GFX6-LABEL: v_usubsat_v6i16: 2095; GFX6: ; %bb.0: 2096; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2097; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2098; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v6 2099; GFX6-NEXT: v_min_u32_e32 v6, v0, v6 2100; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v6 2101; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2102; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v7 2103; GFX6-NEXT: v_min_u32_e32 v6, v1, v6 2104; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v6 2105; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2106; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v8 2107; 
GFX6-NEXT: v_min_u32_e32 v6, v2, v6 2108; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 2109; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2110; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v9 2111; GFX6-NEXT: v_min_u32_e32 v6, v3, v6 2112; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v6 2113; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 2114; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v10 2115; GFX6-NEXT: v_min_u32_e32 v6, v4, v6 2116; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 2117; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2118; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v11 2119; GFX6-NEXT: v_min_u32_e32 v6, v5, v6 2120; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v6 2121; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2122; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2123; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 2124; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 2125; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 2126; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 2127; GFX6-NEXT: s_setpc_b64 s[30:31] 2128; 2129; GFX8-LABEL: v_usubsat_v6i16: 2130; GFX8: ; %bb.0: 2131; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2132; GFX8-NEXT: v_sub_u16_e64 v6, v0, v3 clamp 2133; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2134; GFX8-NEXT: v_sub_u16_e64 v3, v1, v4 clamp 2135; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2136; GFX8-NEXT: v_sub_u16_e64 v4, v2, v5 clamp 2137; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2138; GFX8-NEXT: v_or_b32_e32 v0, v6, v0 2139; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 2140; GFX8-NEXT: v_or_b32_e32 v2, v4, v2 2141; GFX8-NEXT: s_setpc_b64 s[30:31] 2142; 2143; GFX9-LABEL: v_usubsat_v6i16: 2144; GFX9: ; %bb.0: 2145; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2146; GFX9-NEXT: v_pk_sub_u16 v0, v0, v3 clamp 2147; GFX9-NEXT: v_pk_sub_u16 v1, v1, v4 clamp 2148; GFX9-NEXT: v_pk_sub_u16 v2, v2, v5 clamp 2149; GFX9-NEXT: 
s_setpc_b64 s[30:31] 2150; 2151; GFX10PLUS-LABEL: v_usubsat_v6i16: 2152; GFX10PLUS: ; %bb.0: 2153; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2154; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v3 clamp 2155; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v4 clamp 2156; GFX10PLUS-NEXT: v_pk_sub_u16 v2, v2, v5 clamp 2157; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2158 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 2159 %cast = bitcast <6 x i16> %result to <3 x float> 2160 ret <3 x float> %cast 2161} 2162 2163define amdgpu_ps <3 x i32> @s_usubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inreg %rhs) { 2164; GFX6-LABEL: s_usubsat_v6i16: 2165; GFX6: ; %bb.0: 2166; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2167; GFX6-NEXT: s_lshl_b32 s6, s6, 16 2168; GFX6-NEXT: s_min_u32 s6, s0, s6 2169; GFX6-NEXT: s_sub_i32 s0, s0, s6 2170; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2171; GFX6-NEXT: s_lshl_b32 s6, s7, 16 2172; GFX6-NEXT: s_min_u32 s6, s1, s6 2173; GFX6-NEXT: s_sub_i32 s1, s1, s6 2174; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2175; GFX6-NEXT: s_lshl_b32 s6, s8, 16 2176; GFX6-NEXT: s_min_u32 s6, s2, s6 2177; GFX6-NEXT: s_sub_i32 s2, s2, s6 2178; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2179; GFX6-NEXT: s_lshl_b32 s6, s9, 16 2180; GFX6-NEXT: s_min_u32 s6, s3, s6 2181; GFX6-NEXT: s_sub_i32 s3, s3, s6 2182; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2183; GFX6-NEXT: s_lshl_b32 s6, s10, 16 2184; GFX6-NEXT: s_min_u32 s6, s4, s6 2185; GFX6-NEXT: s_sub_i32 s4, s4, s6 2186; GFX6-NEXT: s_lshl_b32 s5, s5, 16 2187; GFX6-NEXT: s_lshl_b32 s6, s11, 16 2188; GFX6-NEXT: s_min_u32 s6, s5, s6 2189; GFX6-NEXT: s_sub_i32 s5, s5, s6 2190; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2191; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2192; GFX6-NEXT: s_lshr_b32 s5, s5, 16 2193; GFX6-NEXT: v_mov_b32_e32 v0, s0 2194; GFX6-NEXT: v_mov_b32_e32 v1, s2 2195; GFX6-NEXT: v_mov_b32_e32 v2, s4 2196; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 2197; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 2198; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 2199; GFX6-NEXT: 
v_readfirstlane_b32 s0, v0 2200; GFX6-NEXT: v_readfirstlane_b32 s1, v1 2201; GFX6-NEXT: v_readfirstlane_b32 s2, v2 2202; GFX6-NEXT: ; return to shader part epilog 2203; 2204; GFX8-LABEL: s_usubsat_v6i16: 2205; GFX8: ; %bb.0: 2206; GFX8-NEXT: s_lshr_b32 s6, s0, 16 2207; GFX8-NEXT: s_lshr_b32 s7, s1, 16 2208; GFX8-NEXT: s_lshr_b32 s9, s3, 16 2209; GFX8-NEXT: s_lshr_b32 s10, s4, 16 2210; GFX8-NEXT: s_lshr_b32 s8, s2, 16 2211; GFX8-NEXT: s_lshr_b32 s11, s5, 16 2212; GFX8-NEXT: v_mov_b32_e32 v1, s9 2213; GFX8-NEXT: v_mov_b32_e32 v2, s6 2214; GFX8-NEXT: v_mov_b32_e32 v3, s10 2215; GFX8-NEXT: v_mov_b32_e32 v4, s7 2216; GFX8-NEXT: v_mov_b32_e32 v0, s3 2217; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2218; GFX8-NEXT: v_mov_b32_e32 v2, s4 2219; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2220; GFX8-NEXT: v_mov_b32_e32 v4, s5 2221; GFX8-NEXT: v_mov_b32_e32 v5, s11 2222; GFX8-NEXT: v_mov_b32_e32 v6, s8 2223; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 2224; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp 2225; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp 2226; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2227; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 2228; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 2229; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 2230; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2231; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2232; GFX8-NEXT: v_readfirstlane_b32 s2, v2 2233; GFX8-NEXT: ; return to shader part epilog 2234; 2235; GFX9-LABEL: s_usubsat_v6i16: 2236; GFX9: ; %bb.0: 2237; GFX9-NEXT: v_mov_b32_e32 v0, s3 2238; GFX9-NEXT: v_mov_b32_e32 v1, s4 2239; GFX9-NEXT: v_mov_b32_e32 v2, s5 2240; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 2241; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp 2242; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp 2243; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2244; GFX9-NEXT: v_readfirstlane_b32 
s1, v1 2245; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2246; GFX9-NEXT: ; return to shader part epilog 2247; 2248; GFX10PLUS-LABEL: s_usubsat_v6i16: 2249; GFX10PLUS: ; %bb.0: 2250; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s3 clamp 2251; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s4 clamp 2252; GFX10PLUS-NEXT: v_pk_sub_u16 v2, s2, s5 clamp 2253; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2254; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 2255; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 2256; GFX10PLUS-NEXT: ; return to shader part epilog 2257 %result = call <6 x i16> @llvm.usub.sat.v6i16(<6 x i16> %lhs, <6 x i16> %rhs) 2258 %cast = bitcast <6 x i16> %result to <3 x i32> 2259 ret <3 x i32> %cast 2260} 2261 2262define <4 x float> @v_usubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { 2263; GFX6-LABEL: v_usubsat_v8i16: 2264; GFX6: ; %bb.0: 2265; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2266; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 2267; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v8 2268; GFX6-NEXT: v_min_u32_e32 v8, v0, v8 2269; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 2270; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 2271; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v9 2272; GFX6-NEXT: v_min_u32_e32 v8, v1, v8 2273; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v1, v8 2274; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 2275; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v10 2276; GFX6-NEXT: v_min_u32_e32 v8, v2, v8 2277; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v8 2278; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2279; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v11 2280; GFX6-NEXT: v_min_u32_e32 v8, v3, v8 2281; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v3, v8 2282; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4 2283; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v12 2284; GFX6-NEXT: v_min_u32_e32 v8, v4, v8 2285; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v8 2286; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5 2287; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v13 2288; GFX6-NEXT: v_min_u32_e32 v8, v5, v8 2289; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v5, v8 2290; GFX6-NEXT: 
v_lshlrev_b32_e32 v6, 16, v6 2291; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v14 2292; GFX6-NEXT: v_min_u32_e32 v8, v6, v8 2293; GFX6-NEXT: v_sub_i32_e32 v6, vcc, v6, v8 2294; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7 2295; GFX6-NEXT: v_lshlrev_b32_e32 v8, 16, v15 2296; GFX6-NEXT: v_min_u32_e32 v8, v7, v8 2297; GFX6-NEXT: v_sub_i32_e32 v7, vcc, v7, v8 2298; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1 2299; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3 2300; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5 2301; GFX6-NEXT: v_lshrrev_b32_e32 v7, 16, v7 2302; GFX6-NEXT: v_alignbit_b32 v0, v1, v0, 16 2303; GFX6-NEXT: v_alignbit_b32 v1, v3, v2, 16 2304; GFX6-NEXT: v_alignbit_b32 v2, v5, v4, 16 2305; GFX6-NEXT: v_alignbit_b32 v3, v7, v6, 16 2306; GFX6-NEXT: s_setpc_b64 s[30:31] 2307; 2308; GFX8-LABEL: v_usubsat_v8i16: 2309; GFX8: ; %bb.0: 2310; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2311; GFX8-NEXT: v_sub_u16_e64 v8, v0, v4 clamp 2312; GFX8-NEXT: v_sub_u16_sdwa v0, v0, v4 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2313; GFX8-NEXT: v_sub_u16_e64 v4, v1, v5 clamp 2314; GFX8-NEXT: v_sub_u16_sdwa v1, v1, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2315; GFX8-NEXT: v_sub_u16_e64 v5, v2, v6 clamp 2316; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v6 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2317; GFX8-NEXT: v_sub_u16_e64 v6, v3, v7 clamp 2318; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 2319; GFX8-NEXT: v_or_b32_e32 v0, v8, v0 2320; GFX8-NEXT: v_or_b32_e32 v1, v4, v1 2321; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 2322; GFX8-NEXT: v_or_b32_e32 v3, v6, v3 2323; GFX8-NEXT: s_setpc_b64 s[30:31] 2324; 2325; GFX9-LABEL: v_usubsat_v8i16: 2326; GFX9: ; %bb.0: 2327; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2328; GFX9-NEXT: v_pk_sub_u16 v0, v0, v4 clamp 2329; GFX9-NEXT: v_pk_sub_u16 v1, v1, v5 clamp 2330; GFX9-NEXT: v_pk_sub_u16 v2, v2, v6 
clamp 2331; GFX9-NEXT: v_pk_sub_u16 v3, v3, v7 clamp 2332; GFX9-NEXT: s_setpc_b64 s[30:31] 2333; 2334; GFX10PLUS-LABEL: v_usubsat_v8i16: 2335; GFX10PLUS: ; %bb.0: 2336; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2337; GFX10PLUS-NEXT: v_pk_sub_u16 v0, v0, v4 clamp 2338; GFX10PLUS-NEXT: v_pk_sub_u16 v1, v1, v5 clamp 2339; GFX10PLUS-NEXT: v_pk_sub_u16 v2, v2, v6 clamp 2340; GFX10PLUS-NEXT: v_pk_sub_u16 v3, v3, v7 clamp 2341; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2342 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 2343 %cast = bitcast <8 x i16> %result to <4 x float> 2344 ret <4 x float> %cast 2345} 2346 2347define amdgpu_ps <4 x i32> @s_usubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inreg %rhs) { 2348; GFX6-LABEL: s_usubsat_v8i16: 2349; GFX6: ; %bb.0: 2350; GFX6-NEXT: s_lshl_b32 s0, s0, 16 2351; GFX6-NEXT: s_lshl_b32 s8, s8, 16 2352; GFX6-NEXT: s_min_u32 s8, s0, s8 2353; GFX6-NEXT: s_sub_i32 s0, s0, s8 2354; GFX6-NEXT: s_lshl_b32 s1, s1, 16 2355; GFX6-NEXT: s_lshl_b32 s8, s9, 16 2356; GFX6-NEXT: s_min_u32 s8, s1, s8 2357; GFX6-NEXT: s_sub_i32 s1, s1, s8 2358; GFX6-NEXT: s_lshl_b32 s2, s2, 16 2359; GFX6-NEXT: s_lshl_b32 s8, s10, 16 2360; GFX6-NEXT: s_min_u32 s8, s2, s8 2361; GFX6-NEXT: s_sub_i32 s2, s2, s8 2362; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2363; GFX6-NEXT: s_lshl_b32 s8, s11, 16 2364; GFX6-NEXT: s_min_u32 s8, s3, s8 2365; GFX6-NEXT: s_sub_i32 s3, s3, s8 2366; GFX6-NEXT: s_lshl_b32 s4, s4, 16 2367; GFX6-NEXT: s_lshl_b32 s8, s12, 16 2368; GFX6-NEXT: s_min_u32 s8, s4, s8 2369; GFX6-NEXT: s_sub_i32 s4, s4, s8 2370; GFX6-NEXT: s_lshl_b32 s5, s5, 16 2371; GFX6-NEXT: s_lshl_b32 s8, s13, 16 2372; GFX6-NEXT: s_min_u32 s8, s5, s8 2373; GFX6-NEXT: s_sub_i32 s5, s5, s8 2374; GFX6-NEXT: s_lshl_b32 s6, s6, 16 2375; GFX6-NEXT: s_lshl_b32 s8, s14, 16 2376; GFX6-NEXT: s_min_u32 s8, s6, s8 2377; GFX6-NEXT: s_sub_i32 s6, s6, s8 2378; GFX6-NEXT: s_lshl_b32 s7, s7, 16 2379; GFX6-NEXT: s_lshl_b32 s8, s15, 16 2380; GFX6-NEXT: s_min_u32 s8, s7, 
s8 2381; GFX6-NEXT: s_sub_i32 s7, s7, s8 2382; GFX6-NEXT: s_lshr_b32 s1, s1, 16 2383; GFX6-NEXT: s_lshr_b32 s3, s3, 16 2384; GFX6-NEXT: s_lshr_b32 s5, s5, 16 2385; GFX6-NEXT: s_lshr_b32 s7, s7, 16 2386; GFX6-NEXT: v_mov_b32_e32 v0, s0 2387; GFX6-NEXT: v_mov_b32_e32 v1, s2 2388; GFX6-NEXT: v_mov_b32_e32 v2, s4 2389; GFX6-NEXT: v_mov_b32_e32 v3, s6 2390; GFX6-NEXT: v_alignbit_b32 v0, s1, v0, 16 2391; GFX6-NEXT: v_alignbit_b32 v1, s3, v1, 16 2392; GFX6-NEXT: v_alignbit_b32 v2, s5, v2, 16 2393; GFX6-NEXT: v_alignbit_b32 v3, s7, v3, 16 2394; GFX6-NEXT: v_readfirstlane_b32 s0, v0 2395; GFX6-NEXT: v_readfirstlane_b32 s1, v1 2396; GFX6-NEXT: v_readfirstlane_b32 s2, v2 2397; GFX6-NEXT: v_readfirstlane_b32 s3, v3 2398; GFX6-NEXT: ; return to shader part epilog 2399; 2400; GFX8-LABEL: s_usubsat_v8i16: 2401; GFX8: ; %bb.0: 2402; GFX8-NEXT: s_lshr_b32 s8, s0, 16 2403; GFX8-NEXT: s_lshr_b32 s9, s1, 16 2404; GFX8-NEXT: s_lshr_b32 s10, s2, 16 2405; GFX8-NEXT: s_lshr_b32 s12, s4, 16 2406; GFX8-NEXT: s_lshr_b32 s13, s5, 16 2407; GFX8-NEXT: s_lshr_b32 s14, s6, 16 2408; GFX8-NEXT: s_lshr_b32 s11, s3, 16 2409; GFX8-NEXT: s_lshr_b32 s15, s7, 16 2410; GFX8-NEXT: v_mov_b32_e32 v1, s12 2411; GFX8-NEXT: v_mov_b32_e32 v2, s8 2412; GFX8-NEXT: v_mov_b32_e32 v3, s13 2413; GFX8-NEXT: v_mov_b32_e32 v4, s9 2414; GFX8-NEXT: v_mov_b32_e32 v5, s14 2415; GFX8-NEXT: v_mov_b32_e32 v6, s10 2416; GFX8-NEXT: v_mov_b32_e32 v0, s4 2417; GFX8-NEXT: v_sub_u16_sdwa v1, v2, v1 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2418; GFX8-NEXT: v_mov_b32_e32 v2, s5 2419; GFX8-NEXT: v_sub_u16_sdwa v3, v4, v3 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2420; GFX8-NEXT: v_mov_b32_e32 v4, s6 2421; GFX8-NEXT: v_sub_u16_sdwa v5, v6, v5 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2422; GFX8-NEXT: v_mov_b32_e32 v6, s7 2423; GFX8-NEXT: v_mov_b32_e32 v7, s15 2424; GFX8-NEXT: v_mov_b32_e32 v8, s11 2425; GFX8-NEXT: v_sub_u16_e64 v0, s0, v0 clamp 
2426; GFX8-NEXT: v_sub_u16_e64 v2, s1, v2 clamp 2427; GFX8-NEXT: v_sub_u16_e64 v4, s2, v4 clamp 2428; GFX8-NEXT: v_sub_u16_e64 v6, s3, v6 clamp 2429; GFX8-NEXT: v_sub_u16_sdwa v7, v8, v7 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 2430; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 2431; GFX8-NEXT: v_or_b32_e32 v1, v2, v3 2432; GFX8-NEXT: v_or_b32_e32 v2, v4, v5 2433; GFX8-NEXT: v_or_b32_e32 v3, v6, v7 2434; GFX8-NEXT: v_readfirstlane_b32 s0, v0 2435; GFX8-NEXT: v_readfirstlane_b32 s1, v1 2436; GFX8-NEXT: v_readfirstlane_b32 s2, v2 2437; GFX8-NEXT: v_readfirstlane_b32 s3, v3 2438; GFX8-NEXT: ; return to shader part epilog 2439; 2440; GFX9-LABEL: s_usubsat_v8i16: 2441; GFX9: ; %bb.0: 2442; GFX9-NEXT: v_mov_b32_e32 v0, s4 2443; GFX9-NEXT: v_mov_b32_e32 v1, s5 2444; GFX9-NEXT: v_mov_b32_e32 v2, s6 2445; GFX9-NEXT: v_mov_b32_e32 v3, s7 2446; GFX9-NEXT: v_pk_sub_u16 v0, s0, v0 clamp 2447; GFX9-NEXT: v_pk_sub_u16 v1, s1, v1 clamp 2448; GFX9-NEXT: v_pk_sub_u16 v2, s2, v2 clamp 2449; GFX9-NEXT: v_pk_sub_u16 v3, s3, v3 clamp 2450; GFX9-NEXT: v_readfirstlane_b32 s0, v0 2451; GFX9-NEXT: v_readfirstlane_b32 s1, v1 2452; GFX9-NEXT: v_readfirstlane_b32 s2, v2 2453; GFX9-NEXT: v_readfirstlane_b32 s3, v3 2454; GFX9-NEXT: ; return to shader part epilog 2455; 2456; GFX10PLUS-LABEL: s_usubsat_v8i16: 2457; GFX10PLUS: ; %bb.0: 2458; GFX10PLUS-NEXT: v_pk_sub_u16 v0, s0, s4 clamp 2459; GFX10PLUS-NEXT: v_pk_sub_u16 v1, s1, s5 clamp 2460; GFX10PLUS-NEXT: v_pk_sub_u16 v2, s2, s6 clamp 2461; GFX10PLUS-NEXT: v_pk_sub_u16 v3, s3, s7 clamp 2462; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 2463; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 2464; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 2465; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 2466; GFX10PLUS-NEXT: ; return to shader part epilog 2467 %result = call <8 x i16> @llvm.usub.sat.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) 2468 %cast = bitcast <8 x i16> %result to <4 x i32> 2469 ret <4 x i32> %cast 2470} 2471 2472define i48 
@v_usubsat_i48(i48 %lhs, i48 %rhs) { 2473; GFX6-LABEL: v_usubsat_i48: 2474; GFX6: ; %bb.0: 2475; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2476; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2477; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v3 2478; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 2479; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2480; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2481; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 2482; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2483; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2484; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 2485; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 2486; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2487; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc 2488; GFX6-NEXT: s_setpc_b64 s[30:31] 2489; 2490; GFX8-LABEL: v_usubsat_i48: 2491; GFX8: ; %bb.0: 2492; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2493; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2494; GFX8-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 2495; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 2496; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2497; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2498; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2499; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2500; GFX8-NEXT: s_setpc_b64 s[30:31] 2501; 2502; GFX9-LABEL: v_usubsat_i48: 2503; GFX9: ; %bb.0: 2504; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2505; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2506; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3] 2507; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 2508; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 2509; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2510; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2511; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2512; GFX9-NEXT: s_setpc_b64 s[30:31] 2513; 2514; GFX10PLUS-LABEL: v_usubsat_i48: 2515; GFX10PLUS: ; %bb.0: 2516; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2517; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2518; GFX10PLUS-NEXT: v_lshlrev_b64 v[2:3], 16, 
v[2:3] 2519; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 2520; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 2521; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2522; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2523; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2524; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2525 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) 2526 ret i48 %result 2527} 2528 2529define amdgpu_ps i48 @s_usubsat_i48(i48 inreg %lhs, i48 inreg %rhs) { 2530; GFX6-LABEL: s_usubsat_i48: 2531; GFX6: ; %bb.0: 2532; GFX6-NEXT: s_sub_u32 s0, s0, s2 2533; GFX6-NEXT: s_cselect_b32 s2, 1, 0 2534; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 2535; GFX6-NEXT: s_and_b32 s3, s3, 0xffff 2536; GFX6-NEXT: s_cmp_lg_u32 s2, 0 2537; GFX6-NEXT: s_subb_u32 s2, s1, s3 2538; GFX6-NEXT: s_and_b32 s1, s2, 0xffff 2539; GFX6-NEXT: s_cmp_lg_u32 s2, s1 2540; GFX6-NEXT: s_cselect_b32 s2, 1, 0 2541; GFX6-NEXT: s_lshr_b32 s3, s0, 16 2542; GFX6-NEXT: s_and_b32 s0, s0, 0xffff 2543; GFX6-NEXT: s_lshl_b32 s3, s3, 16 2544; GFX6-NEXT: s_or_b32 s0, s0, s3 2545; GFX6-NEXT: s_cmp_lg_u32 s2, 0 2546; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2547; GFX6-NEXT: ; return to shader part epilog 2548; 2549; GFX8-LABEL: s_usubsat_i48: 2550; GFX8: ; %bb.0: 2551; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2552; GFX8-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 2553; GFX8-NEXT: s_sub_u32 s0, s0, s2 2554; GFX8-NEXT: s_subb_u32 s1, s1, s3 2555; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2556; GFX8-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 2557; GFX8-NEXT: ; return to shader part epilog 2558; 2559; GFX9-LABEL: s_usubsat_i48: 2560; GFX9: ; %bb.0: 2561; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2562; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 2563; GFX9-NEXT: s_sub_u32 s0, s0, s2 2564; GFX9-NEXT: s_subb_u32 s1, s1, s3 2565; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2566; GFX9-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 2567; GFX9-NEXT: ; return to shader part epilog 2568; 2569; GFX10PLUS-LABEL: 
s_usubsat_i48: 2570; GFX10PLUS: ; %bb.0: 2571; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2572; GFX10PLUS-NEXT: s_lshl_b64 s[2:3], s[2:3], 16 2573; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s2 2574; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3 2575; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2576; GFX10PLUS-NEXT: s_lshr_b64 s[0:1], s[0:1], 16 2577; GFX10PLUS-NEXT: ; return to shader part epilog 2578 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) 2579 ret i48 %result 2580} 2581 2582define amdgpu_ps <2 x float> @usubsat_i48_sv(i48 inreg %lhs, i48 %rhs) { 2583; GFX6-LABEL: usubsat_i48_sv: 2584; GFX6: ; %bb.0: 2585; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 2586; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2587; GFX6-NEXT: v_mov_b32_e32 v2, s1 2588; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2589; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2590; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2591; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 2592; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2593; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2594; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 2595; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 2596; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2597; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc 2598; GFX6-NEXT: ; return to shader part epilog 2599; 2600; GFX8-LABEL: usubsat_i48_sv: 2601; GFX8: ; %bb.0: 2602; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2603; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2604; GFX8-NEXT: v_mov_b32_e32 v2, s1 2605; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2606; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2607; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2608; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2609; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2610; GFX8-NEXT: ; return to shader part epilog 2611; 2612; GFX9-LABEL: usubsat_i48_sv: 2613; GFX9: ; %bb.0: 2614; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2615; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2616; GFX9-NEXT: v_mov_b32_e32 v2, s1 2617; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, 
v0 2618; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2619; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2620; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2621; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2622; GFX9-NEXT: ; return to shader part epilog 2623; 2624; GFX10PLUS-LABEL: usubsat_i48_sv: 2625; GFX10PLUS: ; %bb.0: 2626; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2627; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2628; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 2629; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2630; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2631; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2632; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2633; GFX10PLUS-NEXT: ; return to shader part epilog 2634 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) 2635 %ext.result = zext i48 %result to i64 2636 %cast = bitcast i64 %ext.result to <2 x float> 2637 ret <2 x float> %cast 2638} 2639 2640define amdgpu_ps <2 x float> @usubsat_i48_vs(i48 %lhs, i48 inreg %rhs) { 2641; GFX6-LABEL: usubsat_i48_vs: 2642; GFX6: ; %bb.0: 2643; GFX6-NEXT: s_and_b32 s1, s1, 0xffff 2644; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 2645; GFX6-NEXT: v_mov_b32_e32 v2, s1 2646; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 2647; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 2648; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v0 2649; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v1 2650; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 2651; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 2652; GFX6-NEXT: v_or_b32_e32 v0, v0, v3 2653; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, v1, v2 2654; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2655; GFX6-NEXT: v_cndmask_b32_e64 v1, v2, 0, vcc 2656; GFX6-NEXT: ; return to shader part epilog 2657; 2658; GFX8-LABEL: usubsat_i48_vs: 2659; GFX8: ; %bb.0: 2660; GFX8-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2661; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2662; GFX8-NEXT: v_mov_b32_e32 v2, s1 2663; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, 
s0, v0 2664; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 2665; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2666; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2667; GFX8-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2668; GFX8-NEXT: ; return to shader part epilog 2669; 2670; GFX9-LABEL: usubsat_i48_vs: 2671; GFX9: ; %bb.0: 2672; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2673; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2674; GFX9-NEXT: v_mov_b32_e32 v2, s1 2675; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 2676; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 2677; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2678; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2679; GFX9-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2680; GFX9-NEXT: ; return to shader part epilog 2681; 2682; GFX10PLUS-LABEL: usubsat_i48_vs: 2683; GFX10PLUS: ; %bb.0: 2684; GFX10PLUS-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1] 2685; GFX10PLUS-NEXT: s_lshl_b64 s[0:1], s[0:1], 16 2686; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 2687; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2688; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2689; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2690; GFX10PLUS-NEXT: v_lshrrev_b64 v[0:1], 16, v[0:1] 2691; GFX10PLUS-NEXT: ; return to shader part epilog 2692 %result = call i48 @llvm.usub.sat.i48(i48 %lhs, i48 %rhs) 2693 %ext.result = zext i48 %result to i64 2694 %cast = bitcast i64 %ext.result to <2 x float> 2695 ret <2 x float> %cast 2696} 2697 2698define i64 @v_usubsat_i64(i64 %lhs, i64 %rhs) { 2699; GFX6-LABEL: v_usubsat_i64: 2700; GFX6: ; %bb.0: 2701; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2702; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 2703; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2704; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2705; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2706; GFX6-NEXT: s_setpc_b64 s[30:31] 2707; 2708; GFX8-LABEL: v_usubsat_i64: 2709; GFX8: ; %bb.0: 2710; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) 2711; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v2 2712; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc 2713; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2714; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2715; GFX8-NEXT: s_setpc_b64 s[30:31] 2716; 2717; GFX9-LABEL: v_usubsat_i64: 2718; GFX9: ; %bb.0: 2719; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2720; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2 2721; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc 2722; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2723; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2724; GFX9-NEXT: s_setpc_b64 s[30:31] 2725; 2726; GFX10PLUS-LABEL: v_usubsat_i64: 2727; GFX10PLUS: ; %bb.0: 2728; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2729; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v2 2730; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v3, vcc_lo 2731; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2732; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2733; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] 2734 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) 2735 ret i64 %result 2736} 2737 2738define amdgpu_ps i64 @s_usubsat_i64(i64 inreg %lhs, i64 inreg %rhs) { 2739; GFX6-LABEL: s_usubsat_i64: 2740; GFX6: ; %bb.0: 2741; GFX6-NEXT: s_sub_u32 s0, s0, s2 2742; GFX6-NEXT: s_subb_u32 s1, s1, s3 2743; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2744; GFX6-NEXT: ; return to shader part epilog 2745; 2746; GFX8-LABEL: s_usubsat_i64: 2747; GFX8: ; %bb.0: 2748; GFX8-NEXT: s_sub_u32 s0, s0, s2 2749; GFX8-NEXT: s_subb_u32 s1, s1, s3 2750; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2751; GFX8-NEXT: ; return to shader part epilog 2752; 2753; GFX9-LABEL: s_usubsat_i64: 2754; GFX9: ; %bb.0: 2755; GFX9-NEXT: s_sub_u32 s0, s0, s2 2756; GFX9-NEXT: s_subb_u32 s1, s1, s3 2757; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2758; GFX9-NEXT: ; return to shader part epilog 2759; 2760; GFX10PLUS-LABEL: s_usubsat_i64: 2761; GFX10PLUS: ; %bb.0: 2762; GFX10PLUS-NEXT: s_sub_u32 s0, s0, 
s2 2763; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s3 2764; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2765; GFX10PLUS-NEXT: ; return to shader part epilog 2766 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) 2767 ret i64 %result 2768} 2769 2770define amdgpu_ps <2 x float> @usubsat_i64_sv(i64 inreg %lhs, i64 %rhs) { 2771; GFX6-LABEL: usubsat_i64_sv: 2772; GFX6: ; %bb.0: 2773; GFX6-NEXT: v_mov_b32_e32 v2, s1 2774; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 2775; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2776; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2777; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2778; GFX6-NEXT: ; return to shader part epilog 2779; 2780; GFX8-LABEL: usubsat_i64_sv: 2781; GFX8: ; %bb.0: 2782; GFX8-NEXT: v_mov_b32_e32 v2, s1 2783; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 2784; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc 2785; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2786; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2787; GFX8-NEXT: ; return to shader part epilog 2788; 2789; GFX9-LABEL: usubsat_i64_sv: 2790; GFX9: ; %bb.0: 2791; GFX9-NEXT: v_mov_b32_e32 v2, s1 2792; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 2793; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc 2794; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2795; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2796; GFX9-NEXT: ; return to shader part epilog 2797; 2798; GFX10PLUS-LABEL: usubsat_i64_sv: 2799; GFX10PLUS: ; %bb.0: 2800; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 2801; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2802; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2803; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2804; GFX10PLUS-NEXT: ; return to shader part epilog 2805 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) 2806 %cast = bitcast i64 %result to <2 x float> 2807 ret <2 x float> %cast 2808} 2809 2810define amdgpu_ps <2 x float> @usubsat_i64_vs(i64 %lhs, i64 inreg %rhs) { 2811; GFX6-LABEL: usubsat_i64_vs: 2812; 
GFX6: ; %bb.0: 2813; GFX6-NEXT: v_mov_b32_e32 v2, s1 2814; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 2815; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 2816; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2817; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2818; GFX6-NEXT: ; return to shader part epilog 2819; 2820; GFX8-LABEL: usubsat_i64_vs: 2821; GFX8: ; %bb.0: 2822; GFX8-NEXT: v_mov_b32_e32 v2, s1 2823; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 2824; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc 2825; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2826; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2827; GFX8-NEXT: ; return to shader part epilog 2828; 2829; GFX9-LABEL: usubsat_i64_vs: 2830; GFX9: ; %bb.0: 2831; GFX9-NEXT: v_mov_b32_e32 v2, s1 2832; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 2833; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v2, vcc 2834; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2835; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2836; GFX9-NEXT: ; return to shader part epilog 2837; 2838; GFX10PLUS-LABEL: usubsat_i64_vs: 2839; GFX10PLUS: ; %bb.0: 2840; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 2841; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 2842; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2843; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2844; GFX10PLUS-NEXT: ; return to shader part epilog 2845 %result = call i64 @llvm.usub.sat.i64(i64 %lhs, i64 %rhs) 2846 %cast = bitcast i64 %result to <2 x float> 2847 ret <2 x float> %cast 2848} 2849 2850define <2 x i64> @v_usubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { 2851; GFX6-LABEL: v_usubsat_v2i64: 2852; GFX6: ; %bb.0: 2853; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2854; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v4 2855; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc 2856; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2857; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2858; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v2, v6 2859; GFX6-NEXT: v_subb_u32_e32 v3, vcc, 
v3, v7, vcc 2860; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 2861; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 2862; GFX6-NEXT: s_setpc_b64 s[30:31] 2863; 2864; GFX8-LABEL: v_usubsat_v2i64: 2865; GFX8: ; %bb.0: 2866; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2867; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v4 2868; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc 2869; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2870; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2871; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v2, v6 2872; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc 2873; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 2874; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 2875; GFX8-NEXT: s_setpc_b64 s[30:31] 2876; 2877; GFX9-LABEL: v_usubsat_v2i64: 2878; GFX9: ; %bb.0: 2879; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2880; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4 2881; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc 2882; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 2883; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 2884; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v6 2885; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v7, vcc 2886; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 2887; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 2888; GFX9-NEXT: s_setpc_b64 s[30:31] 2889; 2890; GFX10-LABEL: v_usubsat_v2i64: 2891; GFX10: ; %bb.0: 2892; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2893; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 2894; GFX10-NEXT: v_sub_co_u32 v2, s4, v2, v6 2895; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 2896; GFX10-NEXT: v_sub_co_ci_u32_e64 v3, s4, v3, v7, s4 2897; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2898; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2899; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, s4 2900; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, s4 2901; GFX10-NEXT: s_setpc_b64 s[30:31] 2902; 2903; GFX11-LABEL: v_usubsat_v2i64: 2904; GFX11: ; %bb.0: 2905; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 2906; 
GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v4 2907; GFX11-NEXT: v_sub_co_u32 v2, s0, v2, v6 2908; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v5, vcc_lo 2909; GFX11-NEXT: v_sub_co_ci_u32_e64 v3, s0, v3, v7, s0 2910; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 2911; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 2912; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, s0 2913; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, s0 2914; GFX11-NEXT: s_setpc_b64 s[30:31] 2915 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 2916 ret <2 x i64> %result 2917} 2918 2919define amdgpu_ps <2 x i64> @s_usubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inreg %rhs) { 2920; GFX6-LABEL: s_usubsat_v2i64: 2921; GFX6: ; %bb.0: 2922; GFX6-NEXT: s_sub_u32 s0, s0, s4 2923; GFX6-NEXT: s_subb_u32 s1, s1, s5 2924; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2925; GFX6-NEXT: s_sub_u32 s2, s2, s6 2926; GFX6-NEXT: s_subb_u32 s3, s3, s7 2927; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2928; GFX6-NEXT: ; return to shader part epilog 2929; 2930; GFX8-LABEL: s_usubsat_v2i64: 2931; GFX8: ; %bb.0: 2932; GFX8-NEXT: s_sub_u32 s0, s0, s4 2933; GFX8-NEXT: s_subb_u32 s1, s1, s5 2934; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2935; GFX8-NEXT: s_sub_u32 s2, s2, s6 2936; GFX8-NEXT: s_subb_u32 s3, s3, s7 2937; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2938; GFX8-NEXT: ; return to shader part epilog 2939; 2940; GFX9-LABEL: s_usubsat_v2i64: 2941; GFX9: ; %bb.0: 2942; GFX9-NEXT: s_sub_u32 s0, s0, s4 2943; GFX9-NEXT: s_subb_u32 s1, s1, s5 2944; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2945; GFX9-NEXT: s_sub_u32 s2, s2, s6 2946; GFX9-NEXT: s_subb_u32 s3, s3, s7 2947; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2948; GFX9-NEXT: ; return to shader part epilog 2949; 2950; GFX10PLUS-LABEL: s_usubsat_v2i64: 2951; GFX10PLUS: ; %bb.0: 2952; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4 2953; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5 2954; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2955; GFX10PLUS-NEXT: s_sub_u32 
s2, s2, s6 2956; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7 2957; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2958; GFX10PLUS-NEXT: ; return to shader part epilog 2959 %result = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) 2960 ret <2 x i64> %result 2961} 2962 2963define amdgpu_ps i128 @s_usubsat_i128(i128 inreg %lhs, i128 inreg %rhs) { 2964; GFX6-LABEL: s_usubsat_i128: 2965; GFX6: ; %bb.0: 2966; GFX6-NEXT: s_sub_u32 s0, s0, s4 2967; GFX6-NEXT: s_subb_u32 s1, s1, s5 2968; GFX6-NEXT: s_subb_u32 s2, s2, s6 2969; GFX6-NEXT: s_subb_u32 s3, s3, s7 2970; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2971; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2972; GFX6-NEXT: ; return to shader part epilog 2973; 2974; GFX8-LABEL: s_usubsat_i128: 2975; GFX8: ; %bb.0: 2976; GFX8-NEXT: s_sub_u32 s0, s0, s4 2977; GFX8-NEXT: s_subb_u32 s1, s1, s5 2978; GFX8-NEXT: s_subb_u32 s2, s2, s6 2979; GFX8-NEXT: s_subb_u32 s3, s3, s7 2980; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2981; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2982; GFX8-NEXT: ; return to shader part epilog 2983; 2984; GFX9-LABEL: s_usubsat_i128: 2985; GFX9: ; %bb.0: 2986; GFX9-NEXT: s_sub_u32 s0, s0, s4 2987; GFX9-NEXT: s_subb_u32 s1, s1, s5 2988; GFX9-NEXT: s_subb_u32 s2, s2, s6 2989; GFX9-NEXT: s_subb_u32 s3, s3, s7 2990; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 2991; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 2992; GFX9-NEXT: ; return to shader part epilog 2993; 2994; GFX10PLUS-LABEL: s_usubsat_i128: 2995; GFX10PLUS: ; %bb.0: 2996; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s4 2997; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s5 2998; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s6 2999; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s7 3000; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1] 3001; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3] 3002; GFX10PLUS-NEXT: ; return to shader part epilog 3003 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) 3004 ret i128 %result 3005} 3006 3007define amdgpu_ps <4 x float> 
@usubsat_i128_sv(i128 inreg %lhs, i128 %rhs) { 3008; GFX6-LABEL: usubsat_i128_sv: 3009; GFX6: ; %bb.0: 3010; GFX6-NEXT: v_mov_b32_e32 v4, s1 3011; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 3012; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc 3013; GFX6-NEXT: v_mov_b32_e32 v4, s2 3014; GFX6-NEXT: v_mov_b32_e32 v5, s3 3015; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc 3016; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc 3017; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3018; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3019; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3020; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3021; GFX6-NEXT: ; return to shader part epilog 3022; 3023; GFX8-LABEL: usubsat_i128_sv: 3024; GFX8: ; %bb.0: 3025; GFX8-NEXT: v_mov_b32_e32 v4, s1 3026; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v0 3027; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v4, v1, vcc 3028; GFX8-NEXT: v_mov_b32_e32 v4, s2 3029; GFX8-NEXT: v_mov_b32_e32 v5, s3 3030; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v4, v2, vcc 3031; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc 3032; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3033; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3034; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3035; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3036; GFX8-NEXT: ; return to shader part epilog 3037; 3038; GFX9-LABEL: usubsat_i128_sv: 3039; GFX9: ; %bb.0: 3040; GFX9-NEXT: v_mov_b32_e32 v4, s1 3041; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0 3042; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v4, v1, vcc 3043; GFX9-NEXT: v_mov_b32_e32 v4, s2 3044; GFX9-NEXT: v_mov_b32_e32 v5, s3 3045; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v4, v2, vcc 3046; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v5, v3, vcc 3047; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3048; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3049; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3050; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3051; GFX9-NEXT: ; return to shader part epilog 3052; 3053; GFX10PLUS-LABEL: usubsat_i128_sv: 3054; 
GFX10PLUS: ; %bb.0: 3055; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v0 3056; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 3057; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo 3058; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo 3059; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 3060; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 3061; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo 3062; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo 3063; GFX10PLUS-NEXT: ; return to shader part epilog 3064 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) 3065 %cast = bitcast i128 %result to <4 x float> 3066 ret <4 x float> %cast 3067} 3068 3069define amdgpu_ps <4 x float> @usubsat_i128_vs(i128 %lhs, i128 inreg %rhs) { 3070; GFX6-LABEL: usubsat_i128_vs: 3071; GFX6: ; %bb.0: 3072; GFX6-NEXT: v_mov_b32_e32 v4, s1 3073; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 3074; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc 3075; GFX6-NEXT: v_mov_b32_e32 v4, s2 3076; GFX6-NEXT: v_mov_b32_e32 v5, s3 3077; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc 3078; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc 3079; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3080; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3081; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3082; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3083; GFX6-NEXT: ; return to shader part epilog 3084; 3085; GFX8-LABEL: usubsat_i128_vs: 3086; GFX8: ; %bb.0: 3087; GFX8-NEXT: v_mov_b32_e32 v4, s1 3088; GFX8-NEXT: v_subrev_u32_e32 v0, vcc, s0, v0 3089; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc 3090; GFX8-NEXT: v_mov_b32_e32 v4, s2 3091; GFX8-NEXT: v_mov_b32_e32 v5, s3 3092; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v4, vcc 3093; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc 3094; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3095; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3096; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3097; GFX8-NEXT: v_cndmask_b32_e64 
v3, v3, 0, vcc 3098; GFX8-NEXT: ; return to shader part epilog 3099; 3100; GFX9-LABEL: usubsat_i128_vs: 3101; GFX9: ; %bb.0: 3102; GFX9-NEXT: v_mov_b32_e32 v4, s1 3103; GFX9-NEXT: v_subrev_co_u32_e32 v0, vcc, s0, v0 3104; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v4, vcc 3105; GFX9-NEXT: v_mov_b32_e32 v4, s2 3106; GFX9-NEXT: v_mov_b32_e32 v5, s3 3107; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v4, vcc 3108; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v5, vcc 3109; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3110; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3111; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3112; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3113; GFX9-NEXT: ; return to shader part epilog 3114; 3115; GFX10PLUS-LABEL: usubsat_i128_vs: 3116; GFX10PLUS: ; %bb.0: 3117; GFX10PLUS-NEXT: v_sub_co_u32 v0, vcc_lo, v0, s0 3118; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo 3119; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v2, vcc_lo, s2, v2, vcc_lo 3120; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s3, v3, vcc_lo 3121; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 3122; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 3123; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo 3124; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo 3125; GFX10PLUS-NEXT: ; return to shader part epilog 3126 %result = call i128 @llvm.usub.sat.i128(i128 %lhs, i128 %rhs) 3127 %cast = bitcast i128 %result to <4 x float> 3128 ret <4 x float> %cast 3129} 3130 3131define <2 x i128> @v_usubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) { 3132; GFX6-LABEL: v_usubsat_v2i128: 3133; GFX6: ; %bb.0: 3134; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3135; GFX6-NEXT: v_sub_i32_e32 v0, vcc, v0, v8 3136; GFX6-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc 3137; GFX6-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc 3138; GFX6-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc 3139; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3140; GFX6-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3141; 
GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3142; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3143; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v4, v12 3144; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc 3145; GFX6-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc 3146; GFX6-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc 3147; GFX6-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 3148; GFX6-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 3149; GFX6-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 3150; GFX6-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc 3151; GFX6-NEXT: s_setpc_b64 s[30:31] 3152; 3153; GFX8-LABEL: v_usubsat_v2i128: 3154; GFX8: ; %bb.0: 3155; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3156; GFX8-NEXT: v_sub_u32_e32 v0, vcc, v0, v8 3157; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v1, v9, vcc 3158; GFX8-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc 3159; GFX8-NEXT: v_subb_u32_e32 v3, vcc, v3, v11, vcc 3160; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3161; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3162; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc 3163; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3164; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v4, v12 3165; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v5, v13, vcc 3166; GFX8-NEXT: v_subb_u32_e32 v6, vcc, v6, v14, vcc 3167; GFX8-NEXT: v_subb_u32_e32 v7, vcc, v7, v15, vcc 3168; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 3169; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 3170; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 3171; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc 3172; GFX8-NEXT: s_setpc_b64 s[30:31] 3173; 3174; GFX9-LABEL: v_usubsat_v2i128: 3175; GFX9: ; %bb.0: 3176; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3177; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v8 3178; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v9, vcc 3179; GFX9-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v10, vcc 3180; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v11, vcc 3181; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc 3182; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc 3183; GFX9-NEXT: v_cndmask_b32_e64 v2, 
v2, 0, vcc 3184; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc 3185; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v12 3186; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v13, vcc 3187; GFX9-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v14, vcc 3188; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v15, vcc 3189; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc 3190; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc 3191; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc 3192; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc 3193; GFX9-NEXT: s_setpc_b64 s[30:31] 3194; 3195; GFX10-LABEL: v_usubsat_v2i128: 3196; GFX10: ; %bb.0: 3197; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3198; GFX10-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 3199; GFX10-NEXT: v_sub_co_u32 v4, s4, v4, v12 3200; GFX10-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo 3201; GFX10-NEXT: v_sub_co_ci_u32_e64 v5, s4, v5, v13, s4 3202; GFX10-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo 3203; GFX10-NEXT: v_sub_co_ci_u32_e64 v6, s4, v6, v14, s4 3204; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo 3205; GFX10-NEXT: v_sub_co_ci_u32_e64 v7, s4, v7, v15, s4 3206; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo 3207; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo 3208; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo 3209; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo 3210; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, 0, s4 3211; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, 0, s4 3212; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, 0, s4 3213; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, 0, s4 3214; GFX10-NEXT: s_setpc_b64 s[30:31] 3215; 3216; GFX11-LABEL: v_usubsat_v2i128: 3217; GFX11: ; %bb.0: 3218; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 3219; GFX11-NEXT: v_sub_co_u32 v0, vcc_lo, v0, v8 3220; GFX11-NEXT: v_sub_co_u32 v4, s0, v4, v12 3221; GFX11-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, v1, v9, vcc_lo 3222; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, s0, v5, v13, s0 3223; GFX11-NEXT: v_sub_co_ci_u32_e32 v2, vcc_lo, v2, v10, vcc_lo 3224; GFX11-NEXT: 
v_sub_co_ci_u32_e64 v6, s0, v6, v14, s0
; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, v3, v11, vcc_lo
; GFX11-NEXT: v_sub_co_ci_u32_e64 v7, s0, v7, v15, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, 0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, 0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, 0, s0
; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, 0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
  %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
  ret <2 x i128> %result
}

; Scalar variant of the <2 x i128> unsigned saturating-subtract test: an
; amdgpu_ps function whose operands are both marked inreg. The assertions
; below are autogenerated (see the NOTE at the top of the file) and should be
; regenerated with utils/update_llc_test_checks.py rather than edited by hand.
; For every checked target they expect one s_sub_u32/s_subb_u32 chain per
; 128-bit element, followed by a pair of s_cselect_b64 instructions that can
; substitute 0 for the difference (presumably when the chain leaves a borrow
; in SCC -- TODO confirm against the ISA documentation).
define amdgpu_ps <2 x i128> @s_usubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128> inreg %rhs) {
; GFX6-LABEL: s_usubsat_v2i128:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_sub_u32 s0, s0, s8
; GFX6-NEXT: s_subb_u32 s1, s1, s9
; GFX6-NEXT: s_subb_u32 s2, s2, s10
; GFX6-NEXT: s_subb_u32 s3, s3, s11
; GFX6-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX6-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX6-NEXT: s_sub_u32 s4, s4, s12
; GFX6-NEXT: s_subb_u32 s5, s5, s13
; GFX6-NEXT: s_subb_u32 s6, s6, s14
; GFX6-NEXT: s_subb_u32 s7, s7, s15
; GFX6-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
; GFX6-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_usubsat_v2i128:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_sub_u32 s0, s0, s8
; GFX8-NEXT: s_subb_u32 s1, s1, s9
; GFX8-NEXT: s_subb_u32 s2, s2, s10
; GFX8-NEXT: s_subb_u32 s3, s3, s11
; GFX8-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX8-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX8-NEXT: s_sub_u32 s4, s4, s12
; GFX8-NEXT: s_subb_u32 s5, s5, s13
; GFX8-NEXT: s_subb_u32 s6, s6, s14
; GFX8-NEXT: s_subb_u32 s7, s7, s15
; GFX8-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
; GFX8-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_usubsat_v2i128:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_sub_u32 s0, s0, s8
; GFX9-NEXT: s_subb_u32 s1, s1, s9
; GFX9-NEXT: s_subb_u32 s2, s2, s10
; GFX9-NEXT: s_subb_u32 s3, s3, s11
; GFX9-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX9-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX9-NEXT: s_sub_u32 s4, s4, s12
; GFX9-NEXT: s_subb_u32 s5, s5, s13
; GFX9-NEXT: s_subb_u32 s6, s6, s14
; GFX9-NEXT: s_subb_u32 s7, s7, s15
; GFX9-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
; GFX9-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10PLUS-LABEL: s_usubsat_v2i128:
; GFX10PLUS: ; %bb.0:
; GFX10PLUS-NEXT: s_sub_u32 s0, s0, s8
; GFX10PLUS-NEXT: s_subb_u32 s1, s1, s9
; GFX10PLUS-NEXT: s_subb_u32 s2, s2, s10
; GFX10PLUS-NEXT: s_subb_u32 s3, s3, s11
; GFX10PLUS-NEXT: s_cselect_b64 s[0:1], 0, s[0:1]
; GFX10PLUS-NEXT: s_cselect_b64 s[2:3], 0, s[2:3]
; GFX10PLUS-NEXT: s_sub_u32 s4, s4, s12
; GFX10PLUS-NEXT: s_subb_u32 s5, s5, s13
; GFX10PLUS-NEXT: s_subb_u32 s6, s6, s14
; GFX10PLUS-NEXT: s_subb_u32 s7, s7, s15
; GFX10PLUS-NEXT: s_cselect_b64 s[4:5], 0, s[4:5]
; GFX10PLUS-NEXT: s_cselect_b64 s[6:7], 0, s[6:7]
; GFX10PLUS-NEXT: ; return to shader part epilog
  %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs)
  ret <2 x i128> %result
}

; Declarations of the llvm.usub.sat intrinsic overloads, grouped by element
; width (sub-byte/8-bit, 16-bit, 24-bit, 32-bit, 48-bit, 64-bit, 128-bit).
; Every declaration carries attribute group #0.
declare i7 @llvm.usub.sat.i7(i7, i7) #0
declare i8 @llvm.usub.sat.i8(i8, i8) #0
declare <2 x i8> @llvm.usub.sat.v2i8(<2 x i8>, <2 x i8>) #0
declare <4 x i8> @llvm.usub.sat.v4i8(<4 x i8>, <4 x i8>) #0

declare i16 @llvm.usub.sat.i16(i16, i16) #0
declare <2 x i16> @llvm.usub.sat.v2i16(<2 x i16>, <2 x i16>) #0
declare <3 x i16> @llvm.usub.sat.v3i16(<3 x i16>, <3 x i16>) #0
declare <4 x i16> @llvm.usub.sat.v4i16(<4 x i16>, <4 x i16>) #0
declare <5 x i16> @llvm.usub.sat.v5i16(<5 x i16>, <5 x i16>) #0
declare <6 x i16> @llvm.usub.sat.v6i16(<6 x i16>, <6 x i16>) #0
declare <8 x i16> @llvm.usub.sat.v8i16(<8 x i16>, <8 x i16>) #0

declare i24 @llvm.usub.sat.i24(i24, i24) #0

declare i32 @llvm.usub.sat.i32(i32, i32) #0
declare <2 x i32> @llvm.usub.sat.v2i32(<2 x i32>, <2 x i32>) #0
declare <3 x i32> @llvm.usub.sat.v3i32(<3 x i32>, <3 x i32>) #0
declare <4 x i32> @llvm.usub.sat.v4i32(<4 x i32>, <4 x i32>) #0
declare <5 x i32> @llvm.usub.sat.v5i32(<5 x i32>, <5 x i32>) #0
declare <16 x i32> @llvm.usub.sat.v16i32(<16 x i32>, <16 x i32>) #0

declare i48 @llvm.usub.sat.i48(i48, i48) #0

declare i64 @llvm.usub.sat.i64(i64, i64) #0
declare <2 x i64> @llvm.usub.sat.v2i64(<2 x i64>, <2 x i64>) #0

declare i128 @llvm.usub.sat.i128(i128, i128) #0
declare <2 x i128> @llvm.usub.sat.v2i128(<2 x i128>, <2 x i128>) #0

; Attribute group applied to each intrinsic declaration above.
attributes #0 = { nounwind readnone speculatable willreturn }