divert(-1)


dnl  m4 macros for x86 assembler.


dnl  Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


dnl  Notes:
dnl
dnl  m4 isn't perfect for processing BSD style x86 assembler code, the main
dnl  problems are,
dnl
dnl  1. Doing define(foo,123) and then using foo in an addressing mode like
dnl     foo(%ebx) expands as a macro rather than a constant.  This is worked
dnl     around by using deflit() from asm-defs.m4, instead of define().
dnl
dnl  2. Immediates in macro definitions need a space or `' to stop the $
dnl     looking like a macro parameter.  For example,
dnl
dnl             define(foo, `mov $ 123, %eax')
dnl
dnl     This is only a problem in macro definitions, not in ordinary text,
dnl     and not in macro parameters like text passed to forloop() or ifdef().


dnl  Number of bytes in one mp limb.  deflit() rather than define() is used
dnl  so the name stays a plain constant when it is written directly in front
dnl  of a parenthesized addressing mode (see note 1 above).

deflit(BYTES_PER_MP_LIMB, 4)


dnl  Libtool gives -DPIC -DDLL_EXPORT to indicate a cygwin or mingw DLL.  We
dnl  undefine PIC since we don't need to be position independent in this
dnl  case and definitely don't want the ELF style _GLOBAL_OFFSET_TABLE_ etc.

ifdef(`DLL_EXPORT',`undefine(`PIC')')


dnl  Usage: CPUVEC_FUNCS_LIST
dnl
dnl  A list of the functions from gmp-impl.h x86 struct cpuvec_t, in the
dnl  order they appear in that structure.
dnl
dnl  The expansion is a comma separated list of quoted names, so it can be
dnl  handed to a macro as a ready-made argument list.

define(CPUVEC_FUNCS_LIST,
``add_n',
`addmul_1',
`copyd',
`copyi',
`divexact_1',
`divexact_by3c',
`divrem_1',
`gcd_1',
`lshift',
`mod_1',
`mod_34lsub1',
`modexact_1c_odd',
`mul_1',
`mul_basecase',
`preinv_divrem_1',
`preinv_mod_1',
`rshift',
`sqr_basecase',
`sub_n',
`submul_1'')


dnl  Called: PROLOGUE_cpu(GSYM_PREFIX`'foo)
dnl
dnl  In the x86 code we use explicit TEXT and ALIGN() calls in the code,
dnl  since different alignments are wanted in various circumstances.  So for
dnl  instance,
dnl
dnl                  TEXT
dnl                  ALIGN(16)
dnl          PROLOGUE(mpn_add_n)
dnl          ...
dnl          EPILOGUE()
dnl
dnl  The expansion declares the symbol GLOBL, emits TYPE() and COFF_TYPE()
dnl  information for it, then the label itself.  After the label comes a
dnl  profiling hook chosen by WANT_PROFILING: call_mcount for `prof' or
dnl  `gprof', call_instrument(enter) for `instrument', nothing otherwise.

define(`PROLOGUE_cpu',
m4_assert_numargs(1)
m4_assert_defined(`WANT_PROFILING')
	`GLOBL	$1
	TYPE($1,`function')
	COFF_TYPE($1)
$1:
ifelse(WANT_PROFILING,`prof',      `	call_mcount')
ifelse(WANT_PROFILING,`gprof',     `	call_mcount')
ifelse(WANT_PROFILING,`instrument',`	call_instrument(enter)')
')


dnl  Usage: COFF_TYPE(GSYM_PREFIX`'foo)
dnl
dnl  Emit COFF style ".def ... .endef" type information for a function, when
dnl  supported.  The argument should include any GSYM_PREFIX.
dnl
dnl  Nothing is emitted unless HAVE_COFF_TYPE is `yes'.
dnl
dnl  See autoconf macro GMP_ASM_COFF_TYPE for HAVE_COFF_TYPE.

define(COFF_TYPE,
m4_assert_numargs(1)
m4_assert_defined(`HAVE_COFF_TYPE')
`ifelse(HAVE_COFF_TYPE,yes,
	`.def	$1
	.scl	2
	.type	32
	.endef')')


dnl  Usage: call_mcount
dnl
dnl  For `gprof' style profiling, %ebp is setup as a frame pointer.  None of
dnl  the assembler routines use %ebp this way, so it's done only for the
dnl  benefit of mcount.  glibc sysdeps/i386/i386-mcount.S shows how mcount
dnl  gets the current function from (%esp) and the parent from 4(%ebp).
dnl
dnl  For `prof' style profiling gcc generates mcount calls without setting
dnl  up %ebp, and the same is done here.
dnl
dnl  A 4-byte counter word L(mcount_data_N) is emitted in DATA only when the
dnl  register macro for the current mode (MCOUNT_PIC_REG when PIC is defined,
dnl  otherwise MCOUNT_NONPIC_REG) is non-empty; its address is then loaded
dnl  into that register before MCOUNT_PIC_CALL or MCOUNT_NONPIC_CALL.  In the
dnl  PIC case %ebx is saved, pointed at the GOT for the call, and restored.

define(`call_mcount',
m4_assert_numargs(-1)
m4_assert_defined(`WANT_PROFILING')
m4_assert_defined(`MCOUNT_PIC_REG')
m4_assert_defined(`MCOUNT_NONPIC_REG')
m4_assert_defined(`MCOUNT_PIC_CALL')
m4_assert_defined(`MCOUNT_NONPIC_CALL')
`ifelse(ifdef(`PIC',`MCOUNT_PIC_REG',`MCOUNT_NONPIC_REG'),,,
`	DATA
	ALIGN(4)
L(mcount_data_`'mcount_counter):
	W32	0
	TEXT
')dnl
ifelse(WANT_PROFILING,`gprof',
`	pushl	%ebp
	movl	%esp, %ebp
')dnl
ifdef(`PIC',
`	pushl	%ebx
	call_movl_eip_to_ebx
L(mcount_here_`'mcount_counter):
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(mcount_here_`'mcount_counter)], %ebx
ifelse(MCOUNT_PIC_REG,,,
`	leal	L(mcount_data_`'mcount_counter)@GOTOFF(%ebx), MCOUNT_PIC_REG')
MCOUNT_PIC_CALL
	popl	%ebx
',`dnl non-PIC
ifelse(MCOUNT_NONPIC_REG,,,
`	movl	`$'L(mcount_data_`'mcount_counter), MCOUNT_NONPIC_REG
')dnl
MCOUNT_NONPIC_CALL
')dnl
ifelse(WANT_PROFILING,`gprof',
`	popl	%ebp
')
define(`mcount_counter',incr(mcount_counter))
')

dnl  Sequence number used to make the L(mcount_data_N) and L(mcount_here_N)
dnl  labels unique; call_mcount increments it at the end of each expansion.

define(mcount_counter,1)


dnl  Usage: call_instrument(enter|exit)
dnl
dnl  Call __cyg_profile_func_enter or __cyg_profile_func_exit.
dnl
dnl  For PIC, most routines don't require _GLOBAL_OFFSET_TABLE_ themselves
dnl  so %ebx is just setup for these calls.  It's a bit wasteful to repeat
dnl  the setup for the exit call having done it earlier for the enter, but
dnl  there's nowhere very convenient to hold %ebx through the length of a
dnl  routine, in general.
dnl
dnl  For PIC, because instrument_current_function will be within the current
dnl  object file we can get it just as an offset from %eip, there's no need
dnl  to use the GOT.
dnl
dnl  No attempt is made to maintain the stack alignment gcc generates with
dnl  -mpreferred-stack-boundary.
dnl  This wouldn't be hard, but it seems highly
dnl  unlikely the instrumenting functions would be doing anything that'd
dnl  benefit from alignment, in particular they're unlikely to be using
dnl  doubles or long doubles on the stack.
dnl
dnl  The FRAME scheme is used to conveniently account for the register saves
dnl  before accessing the return address.  Any previous value is saved and
dnl  restored, since plenty of code keeps a value across a "ret" in the
dnl  middle of a routine.
dnl
dnl  For `exit', %eax is pushed before the call and popped afterwards so the
dnl  function's return value survives the profiling call.  The two arguments
dnl  pushed for __cyg_profile_func_$1 are the return address, then the address
dnl  of instrument_current_function; they are dropped again with an addl of 8
dnl  to %esp.

define(call_instrument,
m4_assert_numargs(1)
`	pushdef(`FRAME',0)
ifelse($1,exit,
`	pushl	%eax	FRAME_pushl()	C return value
')
ifdef(`PIC',
`	pushl	%ebx	FRAME_pushl()
	call_movl_eip_to_ebx
L(instrument_here_`'instrument_count):
	movl	%ebx, %ecx
	addl	$_GLOBAL_OFFSET_TABLE_+[.-L(instrument_here_`'instrument_count)], %ebx
	C use addl rather than leal to avoid old gas bugs, see mpn/x86/README
	addl	$instrument_current_function-L(instrument_here_`'instrument_count), %ecx
	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
	pushl	%ecx				FRAME_pushl()	C this function
	call	GSYM_PREFIX`'__cyg_profile_func_$1@PLT
	addl	$`'8, %esp
	popl	%ebx
',
`	C non-PIC
	pushl	m4_empty_if_zero(FRAME)(%esp)	FRAME_pushl()	C return addr
	pushl	$instrument_current_function	FRAME_pushl()	C this function
	call	GSYM_PREFIX`'__cyg_profile_func_$1
	addl	$`'8, %esp
')
ifelse($1,exit,
`	popl	%eax			C return value
')
	popdef(`FRAME')
define(`instrument_count',incr(instrument_count))
')

dnl  Sequence number used to make the L(instrument_here_N) labels unique;
dnl  call_instrument increments it at the end of each expansion.
define(instrument_count,1)


dnl  Usage: instrument_current_function
dnl
dnl  Return the current function name for instrumenting purposes.  This is
dnl  PROLOGUE_current_function, but it sticks at the first such name seen.
dnl
dnl  Sticking to the first name seen ensures that multiple-entrypoint
dnl  functions like mpn_add_nc and mpn_add_n will make enter and exit calls
dnl  giving the same function address.
dnl
dnl  The first name seen is remembered in instrument_current_function_seen.

define(instrument_current_function,
m4_assert_numargs(-1)
`ifdef(`instrument_current_function_seen',
`instrument_current_function_seen',
`define(`instrument_current_function_seen',PROLOGUE_current_function)dnl
PROLOGUE_current_function')')


dnl  Usage: call_movl_eip_to_ebx
dnl
dnl  Generate a call to L(movl_eip_to_ebx), and record the need for that
dnl  routine.
dnl
dnl  The need is recorded by defining movl_eip_to_ebx_needed to 1, which
dnl  generate_movl_eip_to_ebx tests.

define(call_movl_eip_to_ebx,
m4_assert_numargs(-1)
`call	L(movl_eip_to_ebx)
define(`movl_eip_to_ebx_needed',1)')

dnl  Usage: generate_movl_eip_to_ebx
dnl
dnl  Emit a L(movl_eip_to_ebx) routine, if needed and not already generated.
dnl
dnl  The routine loads its own return address from (%esp) into %ebx and
dnl  returns with ret_internal.  movl_eip_to_ebx_done is set to 1 once it has
dnl  been emitted so that later expansions produce nothing.

define(generate_movl_eip_to_ebx,
m4_assert_numargs(-1)
`ifelse(movl_eip_to_ebx_needed,1,
`ifelse(movl_eip_to_ebx_done,1,,
`L(movl_eip_to_ebx):
	movl	(%esp), %ebx
	ret_internal
define(`movl_eip_to_ebx_done',1)
')')')


dnl  Usage: ret
dnl
dnl  Generate a "ret", but if doing instrumented profiling then call
dnl  __cyg_profile_func_exit first.
dnl
dnl  generate_movl_eip_to_ebx is expanded straight after the return, so a
dnl  pending L(movl_eip_to_ebx) helper is emitted there.

define(ret,
m4_assert_numargs(-1)
m4_assert_defined(`WANT_PROFILING')
`ifelse(WANT_PROFILING,instrument,
`ret_instrument',
`ret_internal')
generate_movl_eip_to_ebx
')


dnl  Usage: ret_internal
dnl
dnl  A plain "ret", without any __cyg_profile_func_exit call.  This can be
dnl  used for a return which is internal to some function, such as when
dnl  getting %eip for PIC.
dnl
dnl  The expansion is double quoted so the resulting text "ret" is output as
dnl  is, rather than being rescanned as the `ret' macro above.

define(ret_internal,
m4_assert_numargs(-1)
``ret'')


dnl  Usage: ret_instrument
dnl
dnl  Generate call to __cyg_profile_func_exit and then a ret.  If a ret has
dnl  already been seen from this function then jump to that chunk of code,
dnl  rather than emitting it again.
dnl
dnl  "Already seen" is tracked per function with a macro named
dnl  ret_instrument_seen_<instrument_current_function>, and the shared exit
dnl  code is labelled L(instrument_exit_<instrument_current_function>).

define(ret_instrument,
m4_assert_numargs(-1)
`ifelse(m4_unquote(ret_instrument_seen_`'instrument_current_function),1,
`jmp	L(instrument_exit_`'instrument_current_function)',
`define(ret_instrument_seen_`'instrument_current_function,1)
L(instrument_exit_`'instrument_current_function):
call_instrument(exit)
	ret_internal')')


dnl  Usage: _GLOBAL_OFFSET_TABLE_
dnl
dnl  Expand to _GLOBAL_OFFSET_TABLE_ plus any necessary underscore prefix.
dnl  This lets us write plain _GLOBAL_OFFSET_TABLE_ in SVR4 style, but still
dnl  work with systems requiring an extra underscore such as OpenBSD.
dnl
dnl  deflit is used so "leal _GLOBAL_OFFSET_TABLE_(%eax), %ebx" will come
dnl  out right, though that form doesn't work properly in gas (see
dnl  mpn/x86/README).

deflit(_GLOBAL_OFFSET_TABLE_,
m4_assert_defined(`GOT_GSYM_PREFIX')
`GOT_GSYM_PREFIX`_GLOBAL_OFFSET_TABLE_'')


dnl  --------------------------------------------------------------------------
dnl  Various x86 macros.
dnl


dnl  Usage: ALIGN_OFFSET(bytes,offset)
dnl
dnl  Align to `offset' away from a multiple of `bytes'.
dnl
dnl  This is useful for testing, for example align to something very strict
dnl  and see what effect offsets from it have, "ALIGN_OFFSET(256,32)".
dnl
dnl  Generally you wouldn't execute across the padding, but it's done with
dnl  nop's so it'll work.

define(ALIGN_OFFSET,
m4_assert_numargs(2)
`ALIGN($1)
forloop(`i',1,$2,`	nop
')')


dnl  Usage: defframe(name,offset)
dnl
dnl  Make a definition like the following with which to access a parameter
dnl  or variable on the stack.
dnl
dnl         define(name,`FRAME+offset(%esp)')
dnl
dnl  Actually m4_empty_if_zero(FRAME+offset) is used, which will save one
dnl  byte if FRAME+offset is zero, by putting (%esp) rather than 0(%esp).
dnl  Use define(`defframe_empty_if_zero_disabled',1) if for some reason the
dnl  zero offset is wanted.
dnl
dnl  The new macro also gets a check that when it's used FRAME is actually
dnl  defined, and that the final %esp offset isn't negative, which would
dnl  mean an attempt to access something below the current %esp.
dnl
dnl  deflit() is used rather than a plain define(), so the new macro won't
dnl  delete any following parenthesized expression.  name(%edi) will come
dnl  out say as 16(%esp)(%edi).  This isn't valid assembler and should
dnl  provoke an error, which is better than silently giving just 16(%esp).
dnl
dnl  See README for more on the suggested way to access the stack frame.

define(defframe,
m4_assert_numargs(2)
`deflit(`$1',
m4_assert_defined(`FRAME')
`defframe_check_notbelow(`$1',$2,FRAME)dnl
defframe_empty_if_zero(FRAME+($2))(%esp)')')

dnl  Called: defframe_empty_if_zero(expression)
dnl
dnl  Expand to eval(expression) when defframe_empty_if_zero_disabled is 1,
dnl  otherwise to m4_empty_if_zero(expression).
define(defframe_empty_if_zero,
m4_assert_numargs(1)
`ifelse(defframe_empty_if_zero_disabled,1,
`eval($1)',
`m4_empty_if_zero($1)')')

dnl  Called: defframe_check_notbelow(`name',offset,FRAME)
dnl
dnl  Raise an m4_error if FRAME+offset is negative, otherwise expand to
dnl  nothing.
define(defframe_check_notbelow,
m4_assert_numargs(3)
`ifelse(eval(($3)+($2)<0),1,
`m4_error(`$1 at frame offset $2 used when FRAME is only $3 bytes
')')')


dnl  Usage: FRAME_pushl()
dnl         FRAME_popl()
dnl         FRAME_addl_esp(n)
dnl         FRAME_subl_esp(n)
dnl
dnl  Adjust FRAME appropriately for a pushl or popl, or for an addl or subl
dnl  %esp of n bytes.
dnl
dnl  Using these macros is completely optional.  Sometimes it makes more
dnl  sense to put explicit deflit(`FRAME',N) forms, especially when there's
dnl  jumps and different sequences of FRAME values need to be used in
dnl  different places.
dnl
dnl  FRAME grows by 4 for a pushl and by n for a subl, and shrinks by 4 for
dnl  a popl and by n for an addl.

define(FRAME_pushl,
m4_assert_numargs(0)
m4_assert_defined(`FRAME')
`deflit(`FRAME',eval(FRAME+4))')

define(FRAME_popl,
m4_assert_numargs(0)
m4_assert_defined(`FRAME')
`deflit(`FRAME',eval(FRAME-4))')

define(FRAME_addl_esp,
m4_assert_numargs(1)
m4_assert_defined(`FRAME')
`deflit(`FRAME',eval(FRAME-($1)))')

define(FRAME_subl_esp,
m4_assert_numargs(1)
m4_assert_defined(`FRAME')
`deflit(`FRAME',eval(FRAME+($1)))')


dnl  Usage: defframe_pushl(name)
dnl
dnl  Do a combination FRAME_pushl() and a defframe() to name the stack
dnl  location just pushed.  This should come after a pushl instruction.
dnl  Putting it on the same line works and avoids lengthening the code.  For
dnl  example,
dnl
dnl         pushl   %eax     defframe_pushl(VAR_COUNTER)
dnl
dnl  Notice the defframe() is done with an unquoted -FRAME thus giving its
dnl  current value without tracking future changes.

define(defframe_pushl,
m4_assert_numargs(1)
`FRAME_pushl()defframe(`$1',-FRAME)')


dnl  --------------------------------------------------------------------------
dnl  Assembler instruction macros.
dnl


dnl  Usage: emms_or_femms
dnl         femms_available_p
dnl
dnl  femms_available_p expands to 1 or 0 according to whether the AMD 3DNow
dnl  femms instruction is available.  emms_or_femms expands to femms if
dnl  available, or emms if not.
dnl
dnl  emms_or_femms is meant for use in the K6 directory where plain K6
dnl  (without femms) and K6-2 and K6-3 (with a slightly faster femms) are
dnl  supported together.
dnl
dnl  On K7 femms is no longer faster and is just an alias for emms, so plain
dnl  emms may as well be used.
dnl
dnl  Availability is decided from the HAVE_HOST_CPU_k62, HAVE_HOST_CPU_k63
dnl  and HAVE_HOST_CPU_athlon symbols, via m4_ifdef_anyof_p.

define(femms_available_p,
m4_assert_numargs(-1)
`m4_ifdef_anyof_p(
	`HAVE_HOST_CPU_k62',
	`HAVE_HOST_CPU_k63',
	`HAVE_HOST_CPU_athlon')')

define(emms_or_femms,
m4_assert_numargs(-1)
`ifelse(femms_available_p,1,`femms',`emms')')


dnl  Usage: femms
dnl
dnl  Gas 2.9.1 which comes with FreeBSD 3.4 doesn't support femms, so the
dnl  following is a replacement using .byte.

define(femms,
m4_assert_numargs(-1)
`.byte	15,14	C AMD 3DNow femms')


dnl  Usage: jadcl0(op)
dnl
dnl  Generate a jnc/incl as a substitute for adcl $0,op.  Note this isn't an
dnl  exact replacement, since it doesn't set the flags like adcl does.
dnl
dnl  This finds a use in K6 mpn_addmul_1, mpn_submul_1, mpn_mul_basecase and
dnl  mpn_sqr_basecase because on K6 an adcl is slow, the branch
dnl  misprediction penalty is small, and the multiply algorithm used leads
dnl  to a carry bit on average only 1/4 of the time.
dnl
dnl  jadcl0_disabled can be set to 1 to instead generate an ordinary adcl
dnl  for comparison.  For example,
dnl
dnl         define(`jadcl0_disabled',1)
dnl
dnl  When using a register operand, eg. "jadcl0(%edx)", the jnc/incl code is
dnl  the same size as an adcl.  This makes it possible to use the exact same
dnl  computed jump code when testing the relative speed of the two.

define(jadcl0,
m4_assert_numargs(1)
`ifelse(jadcl0_disabled,1,
	`adcl	$`'0, $1',
	`jnc	L(jadcl0_`'jadcl0_counter)
	incl	$1
L(jadcl0_`'jadcl0_counter):
define(`jadcl0_counter',incr(jadcl0_counter))')')

dnl  Sequence number used to make the L(jadcl0_N) labels unique; jadcl0
dnl  increments it after each jnc/incl expansion.

define(jadcl0_counter,1)


dnl  Usage: x86_lookup(target, key,value, key,value, ...)
dnl         x86_lookup_p(target, key,value, key,value, ...)
dnl
dnl  Look for `target' among the `key' parameters.
dnl
dnl  x86_lookup expands to the corresponding `value', or generates an error
dnl  if `target' isn't found.
dnl
dnl  x86_lookup_p expands to 1 if `target' is found, or 0 if not.
dnl
dnl  Both work recursively: the first key is compared with `target', and if
dnl  it differs that key,value pair is shifted away and the search repeated.
dnl  Fewer than three arguments left means the pairs are exhausted.

define(x86_lookup,
m4_assert_numargs_range(1,999)
`ifelse(eval($#<3),1,
`m4_error(`unrecognised part of x86 instruction: $1
')',
`ifelse(`$1',`$2', `$3',
`x86_lookup(`$1',shift(shift(shift($@))))')')')

define(x86_lookup_p,
m4_assert_numargs_range(1,999)
`ifelse(eval($#<3),1, `0',
`ifelse(`$1',`$2',    `1',
`x86_lookup_p(`$1',shift(shift(shift($@))))')')')


dnl  Usage: x86_opcode_reg32(reg)
dnl         x86_opcode_reg32_p(reg)
dnl
dnl  x86_opcode_reg32 expands to the standard 3 bit encoding for the given
dnl  32-bit register, eg. `%ebp' turns into 5.
dnl
dnl  x86_opcode_reg32_p expands to 1 if reg is a valid 32-bit register, or 0
dnl  if not.

define(x86_opcode_reg32,
m4_assert_numargs(1)
`x86_lookup(`$1',x86_opcode_reg32_list)')

define(x86_opcode_reg32_p,
m4_assert_onearg()
`x86_lookup_p(`$1',x86_opcode_reg32_list)')

define(x86_opcode_reg32_list,
``%eax',0,
`%ecx',1,
`%edx',2,
`%ebx',3,
`%esp',4,
`%ebp',5,
`%esi',6,
`%edi',7')


dnl  Usage: x86_opcode_tttn(cond)
dnl
dnl  Expand to the 4-bit "tttn" field value for the given x86 branch
dnl  condition (like `c', `ae', etc).
dnl
dnl  The key,value table searched is x86_opcode_tttn_list, defined just
dnl  below.  The macro name used here must match that definition exactly,
dnl  since an undefined name would be passed to x86_lookup as a lone literal
dnl  word and every lookup would then fail with an "unrecognised part of x86
dnl  instruction" error.

define(x86_opcode_tttn,
m4_assert_numargs(1)
`x86_lookup(`$1',x86_opcode_tttn_list)')

define(x86_opcode_tttn_list,
``o',  0,
`no',  1,
`b',   2, `c',  2, `nae',2,
`nb',  3, `nc', 3, `ae', 3,
`e',   4, `z',  4,
`ne',  5, `nz', 5,
`be',  6, `na', 6,
`nbe', 7, `a',  7,
`s',   8,
`ns',  9,
`p',  10, `pe', 10, `npo',10,
`np', 11, `npe',11, `po', 11,
`l',  12, `nge',12,
`nl', 13, `ge', 13,
`le', 14, `ng', 14,
`nle',15, `g',  15')


dnl  Usage: cmovCC(%srcreg,%dstreg)
dnl
dnl  Emit a cmov instruction, using a .byte sequence, since various past
dnl  versions of gas don't know cmov.
dnl  For example,
dnl
dnl         cmovz(  %eax, %ebx)
dnl
dnl  The source operand can only be a plain register.  (m4 code implementing
dnl  full memory addressing modes exists, believe it or not, but isn't
dnl  currently needed and isn't included.)
dnl
dnl  All the standard conditions are defined.  Attempting to use one without
dnl  the macro parentheses, such as just "cmovbe %eax, %ebx", will provoke
dnl  an error.  This protects against writing something old gas wouldn't
dnl  understand.

dnl  Called: define_cmov_many(cond,tttn,cond,tttn,...)
dnl
dnl  Call define_cmov for each cond,tttn pair in turn, stopping at the first
dnl  empty cond.
define(define_cmov_many,
`ifelse(m4_length(`$1'),0,,
`define_cmov(`$1',`$2')define_cmov_many(shift(shift($@)))')')

dnl  Called: define_cmov(cond,tttn)
dnl  Emit basically define(cmov<cond>,`cmov_internal(<cond>,<tttn>,`$1',`$2')')
define(define_cmov,
m4_assert_numargs(2)
`define(`cmov$1',
m4_instruction_wrapper()
m4_assert_numargs(2)
`cmov_internal'(m4_doublequote($`'0),``$2'',dnl
m4_doublequote($`'1),m4_doublequote($`'2)))')

dnl  Define a cmovCC macro for every condition name in x86_opcode_tttn_list.
define_cmov_many(x86_opcode_tttn_list)

dnl  Called: cmov_internal(name,tttn,src,dst)
dnl
dnl  Emit the three bytes 15, 64+tttn and 192+8*dst+src, where dst and src
dnl  are the x86_opcode_reg32 numbers of the registers, followed by a comment
dnl  showing the instruction in mnemonic form.
define(cmov_internal,
m4_assert_numargs(4)
`.byte	dnl
15, dnl
eval(64+$2), dnl
eval(192+8*x86_opcode_reg32(`$4')+x86_opcode_reg32(`$3')) dnl
	C `$1 $3, $4'')


dnl  Usage: x86_opcode_regmmx(reg)
dnl
dnl  Validate the given mmx register, and return its number, 0 to 7.

define(x86_opcode_regmmx,
m4_assert_numargs(1)
`x86_lookup(`$1',x86_opcode_regmmx_list)')

define(x86_opcode_regmmx_list,
``%mm0',0,
`%mm1',1,
`%mm2',2,
`%mm3',3,
`%mm4',4,
`%mm5',5,
`%mm6',6,
`%mm7',7')


dnl  Usage: psadbw(%srcreg,%dstreg)
dnl
dnl  Oldish versions of gas don't know psadbw, in particular gas 2.9.1 on
dnl  FreeBSD 3.3 and 3.4 doesn't, so instead emit .byte sequences.
dnl  For example,
dnl
dnl         psadbw( %mm1, %mm2)
dnl
dnl  Only register->register forms are supported here, which suffices for
dnl  the current code.
dnl
dnl  The bytes emitted are 0x0f, 0xf6 and then 192+8*dst+src, where dst and
dnl  src are the x86_opcode_regmmx numbers of the two registers.

define(psadbw,
m4_instruction_wrapper()
m4_assert_numargs(2)
`.byte 0x0f,0xf6,dnl
eval(192+x86_opcode_regmmx(`$2')*8+x86_opcode_regmmx(`$1')) dnl
	C `psadbw $1, $2'')


dnl  Usage: Zdisp(inst,op,op,op)
dnl
dnl  Generate explicit .byte sequences if necessary to force a byte-sized
dnl  zero displacement on an instruction.  For example,
dnl
dnl         Zdisp(  movl,   0,(%esi), %eax)
dnl
dnl  expands to
dnl
dnl                 .byte   0x8b,0x46,0x00  C movl 0(%esi), %eax
dnl
dnl  If the displacement given isn't 0, then normal assembler code is
dnl  generated.  For example,
dnl
dnl         Zdisp(  movl,   4,(%esi), %eax)
dnl
dnl  expands to
dnl
dnl                 movl    4(%esi), %eax
dnl
dnl  This means a single Zdisp() form can be used with an expression for the
dnl  displacement, and .byte will be used only if necessary.  The
dnl  displacement argument is eval()ed.
dnl
dnl  Because there aren't many places a 0(reg) form is wanted, Zdisp is
dnl  implemented with a table of instructions and encodings.  A new entry is
dnl  needed for any different operation or registers.  The table is split
dnl  into separate macros to avoid overflowing BSD m4 macro expansion space.
dnl
dnl  Zdisp_found is cleared, every table part is tried, and an m4_error is
dnl  raised if no entry set Zdisp_found to 1.

define(Zdisp,
m4_assert_numargs(4)
`define(`Zdisp_found',0)dnl
Zdisp_1($@)dnl
Zdisp_2($@)dnl
Zdisp_3($@)dnl
Zdisp_4($@)dnl
ifelse(Zdisp_found,0,
`m4_error(`unrecognised instruction in Zdisp: $1 $2 $3 $4
')')')

dnl  Each table entry below is
dnl
dnl         Zdisp_match(inst,op,op,op, `encoding bytes', $@)
dnl
dnl  where the three ops include a literal 0 in the displacement position and
dnl  $@ supplies the four arguments given to Zdisp.

define(Zdisp_1,`dnl
Zdisp_match( adcl, 0,(%edx), %eax,        `0x13,0x42,0x00',           $@)`'dnl
Zdisp_match( adcl, 0,(%edx), %ebx,        `0x13,0x5a,0x00',           $@)`'dnl
Zdisp_match( adcl, 0,(%edx), %esi,        `0x13,0x72,0x00',           $@)`'dnl
Zdisp_match( addl, %ebx, 0,(%edi),        `0x01,0x5f,0x00',           $@)`'dnl
Zdisp_match( addl, %ecx, 0,(%edi),        `0x01,0x4f,0x00',           $@)`'dnl
Zdisp_match( addl, %esi, 0,(%edi),        `0x01,0x77,0x00',           $@)`'dnl
Zdisp_match( sbbl, 0,(%edx), %eax,        `0x1b,0x42,0x00',           $@)`'dnl
Zdisp_match( sbbl, 0,(%edx), %esi,        `0x1b,0x72,0x00',           $@)`'dnl
Zdisp_match( subl, %ecx, 0,(%edi),        `0x29,0x4f,0x00',           $@)`'dnl
Zdisp_match( movzbl, 0,(%eax,%ebp), %eax, `0x0f,0xb6,0x44,0x28,0x00', $@)`'dnl
Zdisp_match( movzbl, 0,(%ecx,%edi), %edi, `0x0f,0xb6,0x7c,0x39,0x00', $@)`'dnl
Zdisp_match( adc, 0,(%ebx,%ecx,4), %eax,  `0x13,0x44,0x8b,0x00',      $@)`'dnl
Zdisp_match( sbb, 0,(%ebx,%ecx,4), %eax,  `0x1b,0x44,0x8b,0x00',      $@)`'dnl
')
define(Zdisp_2,`dnl
Zdisp_match( movl, %eax, 0,(%edi),        `0x89,0x47,0x00',           $@)`'dnl
Zdisp_match( movl, %ebx, 0,(%edi),        `0x89,0x5f,0x00',           $@)`'dnl
Zdisp_match( movl, %esi, 0,(%edi),        `0x89,0x77,0x00',           $@)`'dnl
Zdisp_match( movl, 0,(%ebx), %eax,        `0x8b,0x43,0x00',           $@)`'dnl
Zdisp_match( movl, 0,(%ebx), %esi,        `0x8b,0x73,0x00',           $@)`'dnl
Zdisp_match( movl, 0,(%edx), %eax,        `0x8b,0x42,0x00',           $@)`'dnl
Zdisp_match( movl, 0,(%esi), %eax,        `0x8b,0x46,0x00',           $@)`'dnl
Zdisp_match( movl, 0,(%esi,%ecx,4), %eax, `0x8b,0x44,0x8e,0x00',      $@)`'dnl
Zdisp_match( mov, 0,(%esi,%ecx,4), %eax,  `0x8b,0x44,0x8e,0x00',      $@)`'dnl
Zdisp_match( mov, %eax, 0,(%edi,%ecx,4),  `0x89,0x44,0x8f,0x00',      $@)`'dnl
')
define(Zdisp_3,`dnl
Zdisp_match( movq, 0,(%eax,%ecx,8), %mm0, `0x0f,0x6f,0x44,0xc8,0x00', $@)`'dnl
Zdisp_match( movq, 0,(%ebx,%eax,4), %mm0, `0x0f,0x6f,0x44,0x83,0x00', $@)`'dnl
Zdisp_match( movq, 0,(%ebx,%eax,4), %mm2, `0x0f,0x6f,0x54,0x83,0x00', $@)`'dnl
Zdisp_match( movq, 0,(%ebx,%ecx,4), %mm0, `0x0f,0x6f,0x44,0x8b,0x00', $@)`'dnl
Zdisp_match( movq, 0,(%edx), %mm0,        `0x0f,0x6f,0x42,0x00',      $@)`'dnl
Zdisp_match( movq, 0,(%esi), %mm0,        `0x0f,0x6f,0x46,0x00',      $@)`'dnl
Zdisp_match( movq, %mm0, 0,(%edi),        `0x0f,0x7f,0x47,0x00',      $@)`'dnl
Zdisp_match( movq, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7f,0x54,0x81,0x00', $@)`'dnl
Zdisp_match( movq, %mm2, 0,(%edx,%eax,4), `0x0f,0x7f,0x54,0x82,0x00', $@)`'dnl
Zdisp_match( movq, %mm0, 0,(%edx,%ecx,8), `0x0f,0x7f,0x44,0xca,0x00', $@)`'dnl
')
define(Zdisp_4,`dnl
Zdisp_match( movd, 0,(%eax,%ecx,4), %mm0, `0x0f,0x6e,0x44,0x88,0x00', $@)`'dnl
Zdisp_match( movd, 0,(%eax,%ecx,8), %mm1, `0x0f,0x6e,0x4c,0xc8,0x00', $@)`'dnl
Zdisp_match( movd, 0,(%edx,%ecx,8), %mm0, `0x0f,0x6e,0x44,0xca,0x00', $@)`'dnl
Zdisp_match( movd, %mm0, 0,(%eax,%ecx,4), `0x0f,0x7e,0x44,0x88,0x00', $@)`'dnl
Zdisp_match( movd, %mm0, 0,(%ecx,%eax,4), `0x0f,0x7e,0x44,0x81,0x00', $@)`'dnl
Zdisp_match( movd, %mm2, 0,(%ecx,%eax,4), `0x0f,0x7e,0x54,0x81,0x00', $@)`'dnl
Zdisp_match( movd, %mm0, 0,(%edx,%ecx,4), `0x0f,0x7e,0x44,0x8a,0x00', $@)`'dnl
')

dnl  Called: Zdisp_match(inst,op,op,op, `bytes', inst,op,op,op)
dnl
dnl  The first four arguments are a table pattern, the fifth its encoding,
dnl  and the last four the arguments given to Zdisp.
dnl
dnl  Two pattern shapes are tried.  First, a pattern with 0 as its second
dnl  argument (displacement on the source operand), which matches when
dnl  arguments 1, 3 and 4 equal arguments 6, 8 and 9.  Failing that, a
dnl  pattern with 0 as its third argument (displacement on the destination
dnl  operand), which matches when arguments 1, 2 and 4 equal arguments 6, 7
dnl  and 9.
dnl
dnl  On a match Zdisp_found is set to 1.  Then the actual displacement is
dnl  eval()ed: if it's zero the .byte encoding is emitted with the mnemonic
dnl  form as a comment, otherwise the instruction is emitted normally.

define(Zdisp_match,
m4_assert_numargs(9)
`ifelse(eval(m4_stringequal_p(`$1',`$6')
	&& m4_stringequal_p(`$2',0)
	&& m4_stringequal_p(`$3',`$8')
	&& m4_stringequal_p(`$4',`$9')),1,
`define(`Zdisp_found',1)dnl
ifelse(eval(`$7'),0,
`	.byte	$5  C `$1 0$3, $4'',
`	$6	$7$8, $9')',

`ifelse(eval(m4_stringequal_p(`$1',`$6')
	&& m4_stringequal_p(`$2',`$7')
	&& m4_stringequal_p(`$3',0)
	&& m4_stringequal_p(`$4',`$9')),1,
`define(`Zdisp_found',1)dnl
ifelse(eval(`$8'),0,
`	.byte	$5  C `$1 $2, 0$4'',
`	$6	$7, $8$9')')')')


dnl  Usage: shldl(count,src,dst)
dnl         shrdl(count,src,dst)
dnl         shldw(count,src,dst)
dnl         shrdw(count,src,dst)
dnl
dnl  Generate a double-shift instruction, possibly omitting a %cl count
dnl  parameter if that's what the assembler requires, as indicated by
dnl  WANT_SHLDL_CL in config.m4.  For example,
dnl
dnl         shldl(  %cl, %eax, %ebx)
dnl
dnl  turns into either
dnl
dnl         shldl   %cl, %eax, %ebx
dnl  or
dnl         shldl   %eax, %ebx
dnl
dnl  Immediate counts are always passed through unchanged.  For example,
dnl
dnl         shrdl(  $2, %esi, %edi)
dnl  becomes
dnl         shrdl   $2, %esi, %edi
dnl
dnl
dnl  If you forget to use the macro form "shldl( ...)" and instead write
dnl  just a plain "shldl ...", an error results.  This ensures the necessary
dnl  variant treatment of %cl isn't accidentally bypassed.

define(define_shd_instruction,
m4_assert_numargs(1)
`define($1,
m4_instruction_wrapper()
m4_assert_numargs(3)
`shd_instruction'(m4_doublequote($`'0),m4_doublequote($`'1),dnl
m4_doublequote($`'2),m4_doublequote($`'3)))')

dnl  Effectively: define(shldl,`shd_instruction(`$0',`$1',`$2',`$3')') etc
define_shd_instruction(shldl)
define_shd_instruction(shrdl)
define_shd_instruction(shldw)
define_shd_instruction(shrdw)

dnl  Called: shd_instruction(op,count,src,dst)
dnl
dnl  The count is dropped only when it is exactly `%cl' and WANT_SHLDL_CL is
dnl  zero; in every other case all three operands are emitted.
define(shd_instruction,
m4_assert_numargs(4)
m4_assert_defined(`WANT_SHLDL_CL')
`ifelse(eval(m4_stringequal_p(`$2',`%cl') && !WANT_SHLDL_CL),1,
``$1'	`$3', `$4'',
``$1'	`$2', `$3', `$4'')')


dnl  Usage: ASSERT([cond][,instructions])
dnl
dnl  If WANT_ASSERT is 1, output the given instructions and expect the given
dnl  flags condition to then be satisfied.  For example,
dnl
dnl         ASSERT(ne, `cmpl %eax, %ebx')
dnl
dnl  The instructions can be omitted to just assert a flags condition with
dnl  no extra calculation.
dnl  For example,
dnl
dnl         ASSERT(nc)
dnl
dnl  When `instructions' is not empty, a pushf/popf is added to preserve the
dnl  flags, but the instructions themselves must preserve any registers that
dnl  matter.  FRAME is adjusted for the push and pop, so the instructions
dnl  given can use defframe() stack variables.
dnl
dnl  The condition can be omitted to just output the given instructions when
dnl  assertion checking is wanted.  In this case the pushf/popf is omitted.
dnl  For example,
dnl
dnl         ASSERT(, `movl %eax, VAR_KEEPVAL')
dnl
dnl  A failed assertion falls through to a ud2 instruction; a satisfied one
dnl  jumps over it to L(ASSERT_okN).

define(ASSERT,
m4_assert_numargs_range(1,2)
m4_assert_defined(`WANT_ASSERT')
`ifelse(WANT_ASSERT,1,
`ifelse(`$1',,
	`$2',
	`C ASSERT
ifelse(`$2',,,`	pushf	ifdef(`FRAME',`FRAME_pushl()')')
	$2
	j`$1'	L(ASSERT_ok`'ASSERT_counter)
	ud2	C assertion failed
L(ASSERT_ok`'ASSERT_counter):
ifelse(`$2',,,`	popf	ifdef(`FRAME',`FRAME_popl()')')
define(`ASSERT_counter',incr(ASSERT_counter))')')')

dnl  Sequence number used to make the L(ASSERT_okN) labels unique; ASSERT
dnl  increments it after each conditional expansion.

define(ASSERT_counter,1)


dnl  Usage: movl_text_address(label,register)
dnl
dnl  Get the address of a text segment label, using either a plain movl or a
dnl  position-independent calculation, as necessary.  For example,
dnl
dnl         movl_text_address(L(foo),%eax)
dnl
dnl  This macro is only meant for use in ASSERT()s or when testing, since
dnl  the PIC sequence it generates will want to be done with a ret balancing
dnl  the call on CPUs with return address branch prediction.
dnl
dnl  The addl generated here has a backward reference to the label, and so
dnl  won't suffer from the two forwards references bug in old gas (described
dnl  in mpn/x86/README).
dnl
dnl  The PIC form calls the next instruction, pops the return address into
dnl  the register, and adds the distance from there to the label.

define(movl_text_address,
m4_assert_numargs(2)
`ifdef(`PIC',
	`call	L(movl_text_address_`'movl_text_address_counter)
L(movl_text_address_`'movl_text_address_counter):
	popl	$2	C %eip
	addl	`$'$1-L(movl_text_address_`'movl_text_address_counter), $2
define(`movl_text_address_counter',incr(movl_text_address_counter))',
	`movl	`$'$1, $2')')

dnl  Sequence number used to make the L(movl_text_address_N) labels unique;
dnl  incremented after each PIC expansion of movl_text_address.

define(movl_text_address_counter,1)


dnl  Usage: notl_or_xorl_GMP_NUMB_MASK(reg)
dnl
dnl  Expand to either "notl `reg'" or "xorl $GMP_NUMB_MASK,`reg'" as
dnl  appropriate for nails in use or not.  The notl form is used when
dnl  GMP_NAIL_BITS is 0.

define(notl_or_xorl_GMP_NUMB_MASK,
m4_assert_numargs(1)
`ifelse(GMP_NAIL_BITS,0,
`notl	`$1'',
`xorl	$GMP_NUMB_MASK, `$1'')')


dnl  Usage: LEA(symbol,reg)
dnl
dnl  Load the address of `symbol' into `reg' through the GOT.  A helper
dnl  L(movl_eip_REG), where REG is `reg' without its first character, is
dnl  called to fetch the return address into `reg'; _GLOBAL_OFFSET_TABLE_ is
dnl  added and symbol@GOT(reg) is loaded.
dnl
dnl  As a side effect EPILOGUE_cpu is redefined to emit that helper followed
dnl  by a SIZE() for its argument.  EPILOGUE_cpu is presumably expanded by
dnl  EPILOGUE() at the end of the function; that isn't visible in this file.
dnl  Each use of LEA replaces the previous EPILOGUE_cpu definition.

define(`LEA',`
define(`EPILOGUE_cpu',
`
L(movl_eip_`'substr($2,1)):
	movl	(%esp), $2
	ret_internal
	SIZE($'`1, .-$'`1)')

	call	L(movl_eip_`'substr($2,1))
	addl	$_GLOBAL_OFFSET_TABLE_, $2
	movl	$1@GOT($2), $2
')


dnl  Usage: DEF_OBJECT(name[,align])
dnl
dnl  Switch to RODATA, align to `align' (2 when only the name is given) and
dnl  emit the label `name'.

define(`DEF_OBJECT',
m4_assert_numargs_range(1,2)
	`RODATA
	ALIGN(ifelse($#,1,2,$2))
$1:
')

dnl  Usage: END_OBJECT(name)
dnl
dnl  Emit a SIZE() for `name', measured from the label to the current
dnl  location.

define(`END_OBJECT',
m4_assert_numargs(1)
`	SIZE(`$1',.-`$1')')

divert`'dnl