| |
| # |
| # (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved. |
| # |
| # This file is part of libacml_mv. |
| # |
| # libacml_mv is free software; you can redistribute it and/or |
| # modify it under the terms of the GNU Lesser General Public |
| # License as published by the Free Software Foundation; either |
| # version 2.1 of the License, or (at your option) any later version. |
| # |
| # libacml_mv is distributed in the hope that it will be useful, |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| # Lesser General Public License for more details. |
| # |
| # You should have received a copy of the GNU Lesser General Public |
| # License along with libacml_mv. If not, see |
| # <http://www.gnu.org/licenses/>. |
| # |
| # |
| |
| |
| # An implementation of the cosf function. |
| # |
| # Prototype: |
| # |
| # float fastcosf(float x); |
| # |
| # Computes cosf(x). |
| # Based on the NAG C implementation. |
| # It will provide proper C99 return values, |
| # but may not raise floating point status bits properly. |
| # Author: Harsha Jagasia |
| # Email: harsha.jagasia@amd.com |
| |
| #ifdef __ELF__ |
| .section .note.GNU-stack,"",@progbits |
| #endif |
| |
| .data |
| .align 32 |
| .L__real_3ff0000000000000: .quad 0x03ff0000000000000 # 1.0 |
| .quad 0 # for alignment |
| .L__real_3fe0000000000000: .quad 0x03fe0000000000000 # 0.5 |
| .quad 0 |
| .L__real_3fc5555555555555: .quad 0x03fc5555555555555 # 0.166666666666 |
| .quad 0 |
| .L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883 # twobypi |
| .quad 0 |
| .L__real_3FF921FB54442D18: .quad 0x03FF921FB54442D18 # piby2 |
| .quad 0 |
| .L__real_3ff921fb54400000: .quad 0x03ff921fb54400000 # piby2_1 |
| .quad 0 |
| .L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331 # piby2_1tail |
| .quad 0 |
| .L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000 # piby2_2 |
| .quad 0 |
| .L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073 # piby2_2tail |
| .quad 0 |
| .L__real_411E848000000000: .quad 0x415312d000000000 # 5e6 0x0411E848000000000 # 5e5 |
| .quad 0 |
| |
| .align 32 |
| .Lcsarray: |
| .quad 0x0bfc5555555555555 # -0.166667 s1 |
| .quad 0x03fa5555555555555 # 0.0416667 c1 |
| .quad 0x03f81111111110bb3 # 0.00833333 s2 |
| .quad 0x0bf56c16c16c16967 # -0.00138889 c2 |
| .quad 0x0bf2a01a019e83e5c # -0.000198413 s3 |
| .quad 0x03efa01a019f4ec90 # 2.48016e-005 c3 |
| .quad 0x03ec71de3796cde01 # 2.75573e-006 s4 |
| .quad 0x0be927e4fa17f65f6 # -2.75573e-007 c4 |
| |
| .text |
| .align 32 |
| .p2align 5,,31 |
| |
| #include "fn_macros.h" |
| #define fname FN_PROTOTYPE(cosf) |
| #define fname_special _cosf_special@PLT |
| |
| # define local variable storage offsets |
| .equ p_temp, 0x30 # temporary for get/put bits operation |
| .equ p_temp1, 0x40 # temporary for get/put bits operation |
| .equ region, 0x50 # pointer to region for amd_remainder_piby2 |
| .equ r, 0x60 # pointer to r for amd_remainder_piby2 |
| .equ stack_size, 0x88 |
| |
| .globl fname |
| .type fname,@function |
| |
| fname: |
| |
| sub $stack_size, %rsp |
| |
| ## if NaN or inf |
| movd %xmm0, %edx |
| mov $0x07f800000, %eax |
| mov %eax, %r10d |
| and %edx, %r10d |
| cmp %eax, %r10d |
| jz .Lcosf_naninf |
| |
| xorpd %xmm2, %xmm2 |
| mov %rdx, %r11 # save 1st return value pointer |
| |
| # GET_BITS_DP64(x, ux); |
| # convert input to double. |
| cvtss2sd %xmm0, %xmm0 |
| |
| # get the input value to an integer register. |
| movsd %xmm0,p_temp(%rsp) |
| mov p_temp(%rsp), %rdx # rdx is ux |
| |
| # ax = (ux & ~SIGNBIT_DP64); |
| mov $0x07fffffffffffffff, %r10 |
| and %rdx, %r10 # r10 is ax |
| |
| mov $1, %r8d # for determining region later on |
| movsd %xmm0, %xmm1 # copy x to xmm1 |
| |
| |
| ## if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */ |
| mov $0x03fe921fb54442d18, %rax |
| cmp %rax, %r10 |
| jg .L__sc_reducec |
| |
| # *c = cos_piby4(x, 0.0); |
| movsd %xmm0, %xmm2 |
| mulsd %xmm2, %xmm2 # x^2 |
| xor %eax, %eax |
| mov %r10, %rdx |
| movsd .L__real_3fe0000000000000(%rip), %xmm5 # .5 |
| jmp .L__sc_piby4c |
| |
| .align 32 |
| .L__sc_reducec: |
| # reduce the argument to be in a range from -pi/4 to +pi/4 |
| # by subtracting multiples of pi/2 |
| # xneg = (ax != ux); |
| cmp %r10, %rdx |
| ## if (xneg) x = -x; |
| jz .Lpositive |
| subsd %xmm0, %xmm2 |
| movsd %xmm2, %xmm0 |
| |
| .Lpositive: |
| ## if (x < 5.0e5) |
| cmp .L__real_411E848000000000(%rip), %r10 |
| jae .Lcosf_reduce_precise |
| |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| |
| # perform taylor series to calc cosx, cosx |
| # xmm0=abs(x), xmm1=x |
| .align 32 |
| .Lcosf_piby4: |
| #/* How many pi/2 is x a multiple of? */ |
| # npi2 = (int)(x * twobypi + 0.5); |
| |
| movsd %xmm0, %xmm2 |
| movsd %xmm0, %xmm4 |
| |
| mulsd .L__real_3fe45f306dc9c883(%rip), %xmm2 # twobypi |
| movsd .L__real_3fe0000000000000(%rip), %xmm5 # .5 |
| |
| #/* How many pi/2 is x a multiple of? */ |
| |
| # xexp = ax >> EXPSHIFTBITS_DP64; |
| mov %r10, %r9 |
| shr $52, %r9 # >> EXPSHIFTBITS_DP64 |
| |
| # npi2 = (int)(x * twobypi + 0.5); |
| addsd %xmm5, %xmm2 # npi2 |
| |
| movsd .L__real_3ff921fb54400000(%rip), %xmm3 # piby2_1 |
| cvttpd2dq %xmm2, %xmm0 # convert to integer |
| movsd .L__real_3dd0b4611a626331(%rip), %xmm1 # piby2_1tail |
| cvtdq2pd %xmm0, %xmm2 # and back to double |
| |
| # /* Subtract the multiple from x to get an extra-precision remainder */ |
| # rhead = x - npi2 * piby2_1; |
| |
| mulsd %xmm2, %xmm3 # use piby2_1 |
| subsd %xmm3, %xmm4 # rhead |
| |
| # rtail = npi2 * piby2_1tail; |
| mulsd %xmm2, %xmm1 # rtail |
| movd %xmm0, %eax |
| |
| # GET_BITS_DP64(rhead-rtail, uy); |
| movsd %xmm4, %xmm0 |
| subsd %xmm1, %xmm0 |
| |
| movsd .L__real_3dd0b4611a600000(%rip), %xmm3 # piby2_2 |
| movsd .L__real_3ba3198a2e037073(%rip), %xmm5 # piby2_2tail |
| movd %xmm0, %rcx # rcx is rhead-rtail |
| |
| # expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64); |
| shl $1, %rcx # strip any sign bit |
| shr $53, %rcx # >> EXPSHIFTBITS_DP64 +1 |
| sub %rcx, %r9 # expdiff |
| |
| ## if (expdiff > 15) |
| cmp $15, %r9 |
| jle .Lexpdiffless15 |
| |
| # /* The remainder is pretty small compared with x, which |
| # implies that x is a near multiple of pi/2 |
| # (x matches the multiple to at least 15 bits) */ |
| |
| # t = rhead; |
| movsd %xmm4, %xmm1 |
| |
| # rtail = npi2 * piby2_2; |
| mulsd %xmm2, %xmm3 |
| |
| # rhead = t - rtail; |
| mulsd %xmm2, %xmm5 # npi2 * piby2_2tail |
| subsd %xmm3, %xmm4 # rhead |
| |
| # rtail = npi2 * piby2_2tail - ((t - rhead) - rtail); |
| subsd %xmm4, %xmm1 # t - rhead |
| subsd %xmm3, %xmm1 # -rtail |
| subsd %xmm1, %xmm5 # rtail |
| |
| # r = rhead - rtail; |
| movsd %xmm4, %xmm0 |
| |
| #HARSHA |
| #xmm1=rtail |
| movsd %xmm5, %xmm1 |
| subsd %xmm5, %xmm0 |
| |
| # xmm0=r, xmm4=rhead, xmm1=rtail |
| .Lexpdiffless15: |
| # region = npi2 & 3; |
| |
| movsd %xmm0, %xmm2 |
| mulsd %xmm0, %xmm2 #x^2 |
| movsd %xmm0, %xmm1 |
| movsd .L__real_3fe0000000000000(%rip), %xmm5 # .5 |
| |
| cmp $0x03f2, %rcx # if r small. |
| jge .L__sc_piby4c # use taylor series if not |
| cmp $0x03de, %rcx # if r really small. |
| jle .L__rc_small # then cos(r) = 1 |
| |
| ## if region is 1 or 3 do a sin calc. |
| and %eax, %r8d |
| jz .Lsinsmall |
| # region 1 or 3 |
| # use simply polynomial |
| # *s = x - x*x*x*0.166666666666666666; |
| movsd .L__real_3fc5555555555555(%rip), %xmm3 |
| mulsd %xmm1, %xmm3 # * x |
| mulsd %xmm2, %xmm3 # * x^2 |
| subsd %xmm3, %xmm1 # xs |
| jmp .L__adjust_region_cos |
| |
| .align 16 |
| .Lsinsmall: |
| # region 0 or 2 |
| # cos = 1.0 - x*x*0.5; |
| movsd .L__real_3ff0000000000000(%rip), %xmm1 # 1.0 |
| mulsd .L__real_3fe0000000000000(%rip), %xmm2 # 0.5 *x^2 |
| subsd %xmm2, %xmm1 |
| jmp .L__adjust_region_cos |
| |
| .align 16 |
| .L__rc_small: # then sin(r) = r |
| ## if region is 1 or 3 do a sin calc. |
| and %eax, %r8d |
| jnz .L__adjust_region_cos |
| movsd .L__real_3ff0000000000000(%rip), %xmm1 # cos(r) is a 1 |
| jmp .L__adjust_region_cos |
| |
| |
| # done with reducing the argument. Now perform the sin/cos calculations. |
| .align 16 |
| .L__sc_piby4c: |
| ## if region is 1 or 3 do a sin calc. |
| and %eax, %r8d |
| jz .Lcospiby4 |
| |
| movsd .Lcsarray+0x30(%rip), %xmm1 # c4 |
| movsd %xmm2, %xmm4 |
| mulsd %xmm2, %xmm1 # x2c4 |
| movsd .Lcsarray+0x10(%rip), %xmm3 # c2 |
| mulsd %xmm4, %xmm4 # x4 |
| mulsd %xmm2, %xmm3 # x2c2 |
| mulsd %xmm0, %xmm2 # x3 |
| addsd .Lcsarray+0x20(%rip), %xmm1 # c3 + x2c4 |
| mulsd %xmm4, %xmm1 # x4(c3 + x2c4) |
| addsd .Lcsarray(%rip), %xmm3 # c1 + x2c2 |
| addsd %xmm3, %xmm1 # c1 + c2x2 + c3x4 + c4x6 |
| mulsd %xmm2, %xmm1 # c1x3 + c2x5 + c3x7 + c4x9 |
| addsd %xmm0, %xmm1 # x + c1x3 + c2x5 + c3x7 + c4x9 |
| |
| jmp .L__adjust_region_cos |
| |
| .align 16 |
| .Lcospiby4: |
| # region 0 or 2 - do a cos calculation |
| movsd .Lcsarray+0x38(%rip), %xmm1 # c4 |
| movsd %xmm2, %xmm4 |
| mulsd %xmm2, %xmm1 # x2c4 |
| movsd .Lcsarray+0x18(%rip), %xmm3 # c2 |
| mulsd %xmm4, %xmm4 # x4 |
| mulsd %xmm2, %xmm3 # x2c2 |
| mulsd %xmm2, %xmm5 # 0.5 * x2 |
| addsd .Lcsarray+0x28(%rip), %xmm1 # c3 + x2c4 |
| mulsd %xmm4, %xmm1 # x4(c3 + x2c4) |
| addsd .Lcsarray+8(%rip), %xmm3 # c1 + x2c2 |
| addsd %xmm3, %xmm1 # c1 + x2c2 + c3x4 + c4x6 |
| mulsd %xmm4, %xmm1 # x4(c1 + c2x2 + c3x4 + c4x6) |
| |
| # -t = rc-1; |
| subsd .L__real_3ff0000000000000(%rip), %xmm5 # 0.5x2 - 1 |
| subsd %xmm5, %xmm1 # cos = 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10 |
| |
| .L__adjust_region_cos: # xmm1 is cos or sin, relies on previous sections to |
| # switch (region) |
| add $1, %eax |
| and $2, %eax |
| jz .L__cos_cleanup |
| ## if region 1 or 2 then we negate the result. |
| xorpd %xmm2, %xmm2 |
| subsd %xmm1, %xmm2 |
| movsd %xmm2, %xmm1 |
| |
| .align 16 |
| .L__cos_cleanup: |
| cvtsd2ss %xmm1, %xmm0 |
| add $stack_size, %rsp |
| ret |
| |
| .align 16 |
| #;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; |
| .Lcosf_reduce_precise: |
| # /* Reduce abs(x) into range [-pi/4,pi/4] */ |
| # __amd_remainder_piby2(ax, &r, ®ion); |
| |
| mov %rdx,p_temp(%rsp) # save ux for use later |
| mov %r10,p_temp1(%rsp) # save ax for use later |
| movd %xmm0, %rdi |
| lea r(%rsp), %rsi |
| lea region(%rsp), %rdx |
| sub $0x020, %rsp |
| |
| call __amd_remainder_piby2d2f@PLT |
| |
| add $0x020, %rsp |
| mov p_temp(%rsp), %rdx # restore ux for use later |
| mov p_temp1(%rsp), %r10 # restore ax for use later |
| mov $1, %r8d # for determining region later on |
| movsd r(%rsp), %xmm0 # r |
| mov region(%rsp), %eax # region |
| |
| movsd %xmm0, %xmm2 |
| mulsd %xmm0, %xmm2 # x^2 |
| movsd %xmm0, %xmm1 |
| movsd .L__real_3fe0000000000000(%rip), %xmm5 # .5 |
| |
| jmp .L__sc_piby4c |
| |
| .align 32 |
| .Lcosf_naninf: |
| call fname_special |
| add $stack_size, %rsp |
| ret |