Internal change

PiperOrigin-RevId: 271275031
Change-Id: I69bce2b27644a3fff7bc445c567c8fab4a8ff234
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..baf0444
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,459 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
diff --git a/Makefile.gbase b/Makefile.gbase
new file mode 100644
index 0000000..ad03d36
--- /dev/null
+++ b/Makefile.gbase
@@ -0,0 +1,248 @@
+#
+#  Copyright (C) 2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+
+# Makefile for libacml_mv library
+
+# What we're building, and where to find it.
+LIBRARY = libacml_mv.a
+
+TARGETS = $(LIBRARY)
+
+# Makefile setup
+include $(COMMONDEFS)
+
+VPATH    = $(BUILD_BASE)/src:$(BUILD_BASE)/src/gas
+
+# Compiler options
+LCOPTS = $(STD_COMPILE_OPTS) $(STD_C_OPTS)
+LCDEFS = $(HOSTDEFS) $(TARGDEFS)
+LCINCS = -I$(BUILD_BASE)/inc
+
+# CFLAGS += -Wall -W -Wstrict-prototypes -Werror -fPIC -O2 $(DEBUG)
+
+ifeq ($(BUILD_ARCH), X8664)
+
+CFILES = \
+	acos.c \
+	acosf.c \
+	acosh.c \
+	acoshf.c \
+	asin.c \
+	asinf.c \
+	asinh.c \
+	asinhf.c \
+	atan2.c \
+	atan2f.c \
+	atan.c \
+	atanf.c \
+	atanh.c \
+	atanhf.c \
+	ceil.c \
+	ceilf.c \
+	cosh.c \
+	coshf.c \
+	exp_special.c \
+	finite.c \
+	finitef.c \
+	floor.c \
+	floorf.c \
+	frexp.c \
+	frexpf.c \
+	hypot.c \
+	hypotf.c \
+	ilogb.c \
+	ilogbf.c \
+	ldexp.c \
+	ldexpf.c \
+	libm_special.c \
+	llrint.c \
+	llrintf.c \
+	llround.c \
+	llroundf.c \
+	log1p.c \
+	log1pf.c \
+	logb.c \
+	logbf.c \
+	log_special.c \
+	lrint.c \
+	lrintf.c \
+	lround.c \
+	lroundf.c \
+	modf.c \
+	modff.c \
+	nan.c \
+	nanf.c \
+	nearbyintf.c \
+	nextafter.c \
+	nextafterf.c \
+	nexttoward.c \
+	nexttowardf.c \
+	pow_special.c \
+	remainder_piby2.c \
+	remainder_piby2d2f.c \
+	rint.c \
+	rintf.c \
+	roundf.c \
+	scalbln.c \
+	scalblnf.c \
+	scalbn.c \
+	scalbnf.c \
+	sincos_special.c \
+	sinh.c \
+	sinhf.c \
+	sqrt.c \
+	sqrtf.c \
+	tan.c \
+	tanf.c \
+	tanh.c \
+	tanhf.c
+
+ASFILES = \
+	cbrtf.S \
+	cbrt.S \
+	copysignf.S \
+	copysign.S \
+	cosf.S \
+	cos.S \
+	exp10f.S \
+	exp10.S \
+	exp2f.S \
+	exp2.S \
+	expf.S \
+	expm1f.S \
+	expm1.S \
+	exp.S \
+	fabsf.S \
+	fabs.S \
+	fdimf.S \
+	fdim.S \
+	fmaxf.S \
+	fmax.S \
+	fminf.S \
+	fmin.S \
+	fmodf.S \
+	fmod.S \
+	log10f.S \
+	log10.S \
+	log2f.S \
+	log2.S \
+	logf.S \
+	log.S \
+	nearbyint.S \
+	powf.S \
+	pow.S \
+	remainderf.S \
+	remainder.S \
+	round.S \
+	sincosf.S \
+	sincos.S \
+	sinf.S \
+	sin.S \
+	truncf.S \
+	trunc.S \
+	v4hcosl.S \
+	v4helpl.S \
+	v4hfrcpal.S \
+	v4hlog10l.S \
+	v4hlog2l.S \
+	v4hlogl.S \
+	v4hsinl.S \
+	vrd2cos.S \
+	vrd2exp.S \
+	vrd2log10.S \
+	vrd2log2.S \
+	vrd2log.S \
+	vrd2sincos.S \
+	vrd2sin.S \
+	vrd4cos.S \
+	vrd4exp.S \
+	vrd4frcpa.S \
+	vrd4log10.S \
+	vrd4log2.S \
+	vrd4log.S \
+	vrd4sin.S \
+	vrdacos.S \
+	vrdaexp.S \
+	vrdalog10.S \
+	vrdalog2.S \
+	vrdalogr.S \
+	vrdalog.S \
+	vrda_scaled_logr.S \
+	vrda_scaledshifted_logr.S \
+	vrdasincos.S \
+	vrdasin.S \
+	vrs4cosf.S \
+	vrs4expf.S \
+	vrs4log10f.S \
+	vrs4log2f.S \
+	vrs4logf.S \
+	vrs4powf.S \
+	vrs4powxf.S \
+	vrs4sincosf.S \
+	vrs4sinf.S \
+	vrs8expf.S \
+	vrs8log10f.S \
+	vrs8log2f.S \
+	vrs8logf.S \
+	vrsacosf.S \
+	vrsaexpf.S \
+	vrsalog10f.S \
+	vrsalog2f.S \
+	vrsalogf.S \
+	vrsapowf.S \
+	vrsapowxf.S \
+	vrsasincosf.S \
+	vrsasinf.S
+
+else
+
+# The special processing of the -lm option in the compiler driver should
+# be delayed until all of the options have been parsed.  Until the
+# driver is cleaned up, it is important that processing be the same on
+# all architectures.  Thus we add an empty 32-bit ACML vector math
+# library.
+
+dummy.c :
+	echo "void libacml_mv_placeholder() {}" > dummy.c
+
+CFILES = dummy.c
+LDIRT += dummy.c
+
+endif
+
+
+default:
+	$(MAKE)  first
+	$(MAKE)  $(TARGETS)
+	$(MAKE)  last
+
+first : 
+ifndef SKIP_DEP_BUILD
+	$(call submake,$(BUILD_AREA)/include)
+endif
+
+last : make_libdeps
+
+include $(COMMONRULES)
+
+$(LIBRARY): $(OBJECTS)
+	$(ar) cru $@ $^
+	$(ranlib) $@
+
diff --git a/acml_trace.cc b/acml_trace.cc
new file mode 100644
index 0000000..b5c967f
--- /dev/null
+++ b/acml_trace.cc
@@ -0,0 +1,86 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: martint@google.com (Martin Thuresson)
+
+#include "third_party/open64_libacml_mv/acml_trace.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <functional>
+#include <string>
+
+#include "base/commandlineflags.h"
+#include "base/examine_stack.h"
+#include "base/googleinit.h"
+#include "base/init_google.h"
+#include "base/logging.h"
+#include "file/base/file.h"
+#include "file/base/helpers.h"
+#include "testing/base/public/benchmark.h"
+#include "testing/base/public/googletest.h"
+#include "third_party/absl/strings/cord.h"
+#include "third_party/open64_libacml_mv/libacml.h"
+#include "util/task/status.h"
+
+// Reads `filename` fully, then appends callback(&reader) until reader.done().
+template <typename T>
+std::unique_ptr<std::vector<T>> InitTrace(
+    const char* filename, std::function<T(CordReader* reader)> callback) {
+  Cord cord;
+  CHECK_OK(file::GetContents(filename, &cord, file::Defaults()));
+
+  // The callback is responsible for advancing the reader; each value it
+  // returns becomes the next trace entry.
+  std::unique_ptr<std::vector<T>> entries(new std::vector<T>);
+  for (CordReader reader(cord); !reader.done();) {
+    entries->push_back(callback(&reader));
+  }
+  return entries;
+}
+
+// Loads a trace file by reading it as a sequence of sizeof(double) records.
+std::unique_ptr<std::vector<double>> GetTraceDouble(const char *filename) {
+  // Requests sizeof(d) bytes per call; CHECK-fails when the reader reports
+  // fewer bytes available than one full double.
+  const std::function<double(CordReader* reader)> read_double =
+      [](CordReader* reader) -> double {
+        double d;
+        CHECK_GE(reader->Available(), sizeof(d));
+        reader->ReadN(sizeof(d), reinterpret_cast<char*>(&d));
+        return d;
+      };
+  return InitTrace<double>(filename, read_double);
+}
+
+// Loads a trace file by reading it as consecutive pairs of doubles.
+std::unique_ptr<std::vector<std::pair<double, double>>> GetTraceDoublePair(
+    const char *filename) {
+  typedef std::pair<double, double> DoublePair;
+  // Reads both doubles of one pair; CHECK-fails if the reader runs short.
+  const std::function<DoublePair(CordReader* reader)> read_pair =
+      [](CordReader* reader) -> DoublePair {
+        double d[2];
+        CHECK_GE(reader->Available(), sizeof(d));
+        reader->ReadN(sizeof(d), reinterpret_cast<char*>(&d));
+        return DoublePair(d[0], d[1]);
+      };
+  return InitTrace<DoublePair>(filename, read_pair);
+}
+
+// Read a trace file with floats; CHECK-fails on a truncated trailing value.
+std::unique_ptr<std::vector<float>> GetTraceFloat(const char *filename) {
+  std::function<float(CordReader* reader)> read_float =
+      [](CordReader* reader) {
+    float f;
+    // Require a whole float, as the double readers do; never return a
+    // partially initialized value.
+    CHECK_GE(reader->Available(), sizeof(f));
+    reader->ReadN(sizeof(f), reinterpret_cast<char*>(&f));
+    return f;
+  };
+  return InitTrace<float>(filename, read_float);
+}
diff --git a/acml_trace.h b/acml_trace.h
new file mode 100644
index 0000000..65eda94
--- /dev/null
+++ b/acml_trace.h
@@ -0,0 +1,25 @@
+// Copyright 2012 and onwards Google Inc.
+// Author: martint@google.com (Martin Thuresson)
+
+#ifndef THIRD_PARTY_OPEN64_LIBACML_MV_ACML_TRACE_H__
+#define THIRD_PARTY_OPEN64_LIBACML_MV_ACML_TRACE_H__
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+// Log files gathered from a complete run of rephil/docs. They contain the
+// arguments to all exp/log/pow calls. BASE_TRACE_PATH already ends in '/'.
+#define BASE_TRACE_PATH "google3/third_party/open64_libacml_mv/testdata/"
+#define EXP_LOGFILE (BASE_TRACE_PATH "exp.rephil_docs.builtin.baseline.trace")
+#define EXPF_LOGFILE (BASE_TRACE_PATH "expf.fastmath_unittest.trace")
+#define LOG_LOGFILE (BASE_TRACE_PATH "log.rephil_docs.builtin.baseline.trace")
+#define POW_LOGFILE (BASE_TRACE_PATH "pow.rephil_docs.builtin.baseline.trace")
+
+// Readers: each loads a whole trace file into memory as a vector of records.
+std::unique_ptr<std::vector<std::pair<double, double>>> GetTraceDoublePair(
+    const char *filename);
+std::unique_ptr<std::vector<double>> GetTraceDouble(const char *filename);
+std::unique_ptr<std::vector<float>> GetTraceFloat(const char *filename);
+
+#endif  // THIRD_PARTY_OPEN64_LIBACML_MV_ACML_TRACE_H__
diff --git a/acml_trace_benchmark.cc b/acml_trace_benchmark.cc
new file mode 100644
index 0000000..fb6acc4
--- /dev/null
+++ b/acml_trace_benchmark.cc
@@ -0,0 +1,272 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: martint@google.com (Martin Thuresson)
+
+#include "third_party/open64_libacml_mv/acml_trace.h"
+
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+
+#include <memory>
+#include <vector>
+
+#include "base/commandlineflags.h"
+#include "base/examine_stack.h"
+#include "base/googleinit.h"
+#include "base/init_google.h"
+#include "base/logging.h"
+#include "file/base/file.h"
+#include "file/base/path.h"
+#include "testing/base/public/benchmark.h"
+#include "testing/base/public/googletest.h"
+#include "third_party/open64_libacml_mv/libacml.h"
+
+
+int main(int argc, char** argv) {  // Benchmark-only binary; always exits 0.
+  InitGoogle(argv[0], &argc, &argv, true);
+  RunSpecifiedBenchmarks();  // Presumably runs those selected via flags.
+  return 0;
+}
+
+namespace {
+
+// Short aliases for the owning trace containers the GetTrace* readers return.
+using DoubleListPtr = std::unique_ptr<std::vector<double>>;
+using FloatListPtr = std::unique_ptr<std::vector<float>>;
+using DoublePairListPtr =
+    std::unique_ptr<std::vector<std::pair<double, double>>>;
+
+/////////////////////////
+// Benchmark log() calls.
+/////////////////////////
+
+// Baseline: time spent just walking the log() trace, with no math call.
+static void BM_math_trace_read_log(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, LOG_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  // Timed region: add up every traced value, `iters` times over.
+  double d = 0.0;
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double value : *trace) {
+      d += value;
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times acml_log() over every argument in the log() trace.
+static void BM_math_trace_acmllog(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, LOG_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double x : *trace) {
+      d += acml_log(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times log() over every argument in the log() trace.
+static void BM_math_trace_log(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, LOG_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double x : *trace) {
+      d += log(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+
+/////////////////////////
+// Benchmark exp() calls.
+/////////////////////////
+
+// Baseline: time spent just walking the exp() trace, with no math call.
+static void BM_math_trace_read_exp(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXP_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the values, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double value : *trace) {
+      d += value;
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times acml_exp() over every argument in the exp() trace.
+static void BM_math_trace_acmlexp(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXP_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double x : *trace) {
+      d += acml_exp(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times exp() over every argument in the exp() trace.
+static void BM_math_trace_exp(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXP_LOGFILE);
+  DoubleListPtr trace = GetTraceDouble(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const double x : *trace) {
+      d += exp(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+/////////////////////////
+// Benchmark expf() calls.
+/////////////////////////
+
+// Baseline: time spent just walking the expf() trace, with no math call.
+static void BM_math_trace_read_expf(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXPF_LOGFILE);
+  FloatListPtr trace = GetTraceFloat(path.c_str());
+  StartBenchmarkTiming();
+  float d = 0.0;  // Running sum of the values, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const float value : *trace) {
+      d += value;
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times acml_expf() over every argument in the expf() trace.
+static void BM_math_trace_acmlexpf(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXPF_LOGFILE);
+  FloatListPtr trace = GetTraceFloat(path.c_str());
+  StartBenchmarkTiming();
+  float d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const float x : *trace) {
+      d += acml_expf(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times expf() over every argument in the expf() trace.
+static void BM_math_trace_expf(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXPF_LOGFILE);
+  FloatListPtr trace = GetTraceFloat(path.c_str());
+  StartBenchmarkTiming();
+  float d = 0.0;  // Running sum of the results, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const float x : *trace) {
+      d += expf(x);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+
+/////////////////////////
+// Benchmark pow() calls.
+/////////////////////////
+
+// Baseline: time spent just walking the pow() trace, with no math call.
+static void BM_math_trace_read_pow(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, POW_LOGFILE);
+  DoublePairListPtr trace = GetTraceDoublePair(path.c_str());
+  StartBenchmarkTiming();
+  double d = 0.0;  // Running sum of the values, checked after the loops.
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const auto& args : *trace) {
+      d += args.first + args.second;
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times acml_pow() over every argument pair in the pow() trace.
+static void BM_math_trace_acmlpow(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, POW_LOGFILE);
+  DoublePairListPtr trace = GetTraceDoublePair(path.c_str());
+  StartBenchmarkTiming();
+  // Timed region: each record supplies (first, second) as the two arguments.
+  double d = 0.0;
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const auto& args : *trace) {
+      d += acml_pow(args.first, args.second);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+// Times pow() over every argument pair in the pow() trace.
+static void BM_math_trace_pow(int iters) {
+  // Load the trace while the benchmark clock is stopped.
+  StopBenchmarkTiming();
+  const auto path = file::JoinPath(FLAGS_test_srcdir, POW_LOGFILE);
+  DoublePairListPtr trace = GetTraceDoublePair(path.c_str());
+  StartBenchmarkTiming();
+  // Timed region: each record supplies (first, second) as the two arguments.
+  double d = 0.0;
+  for (int pass = 0; pass < iters; ++pass) {
+    for (const auto& args : *trace) {
+      d += pow(args.first, args.second);
+    }
+  }
+  CHECK_NE(d, 0.0);
+}
+
+
+BENCHMARK(BM_math_trace_read_exp);  // exp trace: iteration only
+BENCHMARK(BM_math_trace_acmlexp);   // exp trace: acml_exp()
+BENCHMARK(BM_math_trace_exp);       // exp trace: exp()
+
+BENCHMARK(BM_math_trace_read_log);  // log trace: iteration only
+BENCHMARK(BM_math_trace_acmllog);   // log trace: acml_log()
+BENCHMARK(BM_math_trace_log);       // log trace: log()
+
+BENCHMARK(BM_math_trace_read_pow);  // pow trace: iteration only
+BENCHMARK(BM_math_trace_acmlpow);   // pow trace: acml_pow()
+BENCHMARK(BM_math_trace_pow);       // pow trace: pow()
+
+BENCHMARK(BM_math_trace_read_expf);  // expf trace: iteration only
+BENCHMARK(BM_math_trace_acmlexpf);   // expf trace: acml_expf()
+BENCHMARK(BM_math_trace_expf);       // expf trace: expf()
+
+}  // namespace
diff --git a/acml_trace_validate_test.cc b/acml_trace_validate_test.cc
new file mode 100644
index 0000000..9bd682c
--- /dev/null
+++ b/acml_trace_validate_test.cc
@@ -0,0 +1,114 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+// Author: martint@google.com (Martin Thuresson)
+
+#include "third_party/open64_libacml_mv/acml_trace.h"
+
+#include <math.h>
+#include <stdio.h>
+
+#include <memory>
+#include <vector>
+
+#include "base/commandlineflags.h"
+#include "base/examine_stack.h"
+#include "base/googleinit.h"
+#include "base/init_google.h"
+#include "base/logging.h"
+#include "file/base/file.h"
+#include "file/base/path.h"
+#include "testing/base/public/benchmark.h"
+#include "testing/base/public/googletest.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/open64_libacml_mv/libacml.h"
+
+
+int main(int argc, char** argv) {
+  InitGoogle(argv[0], &argc, &argv, true);
+  RunSpecifiedBenchmarks();  // Note: this file registers no BENCHMARK().
+  return RUN_ALL_TESTS();  // The process exit code is the test-run result.
+}
+
+
+// Compare two doubles given a maximum unit of least precision (ULP).
+bool AlmostEqualDoubleUlps(double A, double B, int64 maxUlps) {
+  CHECK_EQ(sizeof(A), sizeof(maxUlps));
+  if (A == B) return true;
+  const int64 a = *(reinterpret_cast<int64*>(&A));
+  const int64 b = *(reinterpret_cast<int64*>(&B));
+  // Opposite signs never match: a - b would overflow (1.0 once matched -1.0).
+  return (a < 0) == (b < 0) && (a > b ? a - b : b - a) <= maxUlps;
+}
+
+// Compare two floats given a maximum unit of least precision (ULP).
+bool AlmostEqualFloatUlps(float A, float B, int32 maxUlps) {
+  CHECK_EQ(sizeof(A), sizeof(maxUlps));
+  if (A == B) return true;
+  const int32 a = *(reinterpret_cast<int32*>(&A));
+  const int32 b = *(reinterpret_cast<int32*>(&B));
+  // Opposite signs never match: a - b would overflow (1.0f once matched -1.0f).
+  return (a < 0) == (b < 0) && (a > b ? a - b : b - a) <= maxUlps;
+}
+
+TEST(Case, LogTest) {
+  // Load every recorded log() argument from the trace file.
+  const auto path = file::JoinPath(FLAGS_test_srcdir, LOG_LOGFILE);
+  const std::unique_ptr<std::vector<double>> trace =
+      GetTraceDouble(path.c_str());
+  for (const double x : *trace) {
+    // Run the implementation under test and the reference log() on the same
+    // argument.
+    const double d1 = acml_log(x);
+    const double d2 = log(x);
+    // Make sure difference is at most 1 ULP.
+    EXPECT_TRUE(AlmostEqualDoubleUlps(d1, d2, 1));
+  }
+}
+
+TEST(Case, ExpTest) {
+  // Load every recorded exp() argument from the trace file.
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXP_LOGFILE);
+  const std::unique_ptr<std::vector<double>> trace =
+      GetTraceDouble(path.c_str());
+  for (const double x : *trace) {
+    // Run the implementation under test and the reference exp() on the same
+    // argument.
+    const double d1 = acml_exp(x);
+    const double d2 = exp(x);
+    // Make sure difference is at most 1 ULP.
+    EXPECT_TRUE(AlmostEqualDoubleUlps(d1, d2, 1));
+  }
+}
+
+
+TEST(Case, ExpfTest) {
+  // Load every recorded expf() argument from the trace file.
+  const auto path = file::JoinPath(FLAGS_test_srcdir, EXPF_LOGFILE);
+  const std::unique_ptr<std::vector<float>> trace =
+      GetTraceFloat(path.c_str());
+  for (const float x : *trace) {
+    // Run the implementation under test and the reference expf() on the same
+    // argument.
+    const float f1 = acml_expf(x);
+    const float f2 = expf(x);
+    // Make sure difference is at most 1 ULP.
+    EXPECT_TRUE(AlmostEqualFloatUlps(f1, f2, 1));
+  }
+}
+
+
+TEST(Case, PowTest) {
+  // Load the recorded (base, exponent) argument pairs from the trace file.
+  const auto path = file::JoinPath(FLAGS_test_srcdir, POW_LOGFILE);
+  const std::unique_ptr<std::vector<std::pair<double, double>>> trace =
+      GetTraceDoublePair(path.c_str());
+  for (const auto& args : *trace) {
+    const double base = args.first;
+    const double exponent = args.second;
+    // Run the implementation under test and the reference pow() on the same
+    // arguments.
+    const double d1 = acml_pow(base, exponent);
+    const double d2 = pow(base, exponent);
+    // Make sure difference is at most 1 ULP.
+    EXPECT_TRUE(AlmostEqualDoubleUlps(d1, d2, 1));
+  }
+}
diff --git a/inc/acml_mv.h b/inc/acml_mv.h
new file mode 100644
index 0000000..49b7feb
--- /dev/null
+++ b/inc/acml_mv.h
@@ -0,0 +1,81 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+
+/* C prototypes for the fast/vector libm functions.  No include guard:
+** repeated inclusion is harmless, as only prototypes are declared here.
+*/
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+** The scalar routines: double precision first, then the float (f-suffix) ones.
+*/
+double fastexp(double);
+double fastlog(double);
+double fastlog10(double);
+double fastlog2(double);
+double fastpow(double,double);
+double fastsin(double);
+double fastcos(double);
+void fastsincos(double , double *, double *);
+
+float fastexpf(float );
+float fastlogf(float );
+float fastlog10f(float );
+float fastlog2f(float );
+float fastpowf(float,float);
+float fastcosf(float );
+float fastsinf(float );
+void fastsincosf(float, float *,float *);
+
+
+/*
+** The array routines: vrda_* take double arrays, vrsa_* take float arrays.
+*/
+void vrda_exp(int, double *, double *);
+void vrda_log(int, double *, double *);
+void vrda_log10(int, double *, double *);
+void vrda_log2(int, double *, double *);
+void vrda_sin(int, double *, double *);
+void vrda_cos(int, double *, double *);
+void vrda_sincos(int, double *, double *, double *);
+
+void vrsa_expf(int, float *, float *);
+void vrsa_logf(int, float *, float *);
+void vrsa_log10f(int, float *, float *);
+void vrsa_log2f(int, float *, float *);
+void vrsa_powf(int n, float *x, float *y, float *z);
+void vrsa_powxf(int n, float *x, float y, float *z);
+void vrsa_sinf(int, float *, float *);
+void vrsa_cosf(int, float *, float *);
+void vrsa_sincosf(int, float *, float *, float *);
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/inc/acml_mv_m128.h b/inc/acml_mv_m128.h
new file mode 100644
index 0000000..c783fe3
--- /dev/null
+++ b/inc/acml_mv_m128.h
@@ -0,0 +1,103 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+
+/* C prototypes for the fast/vector libm functions, including the __m128 and
+** __m128d single-vector entry points (hence the <emmintrin.h> include). */
+
+#include <emmintrin.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/*
+** The scalar routines: double precision first, then the float (f-suffix) ones.
+*/
+double fastexp(double);
+double fastlog(double);
+double fastlog10(double);
+double fastlog2(double);
+double fastpow(double,double);
+double fastsin(double);
+double fastcos(double);
+void fastsincos(double , double *, double *);
+
+float fastexpf(float );
+float fastlogf(float );
+float fastlog10f(float );
+float fastlog2f(float );
+float fastpowf(float,float);
+float fastcosf(float );
+float fastsinf(float );
+void fastsincosf(float, float *,float *);
+
+/*
+** The single vector routines: __vrd2_* use __m128d, __vrs4_* use __m128.
+*/
+__m128d __vrd2_log(__m128d);
+__m128d __vrd2_exp(__m128d);
+__m128d __vrd2_log10(__m128d);
+__m128d __vrd2_log2(__m128d);
+__m128d __vrd2_sin(__m128d);
+__m128d __vrd2_cos(__m128d);
+void __vrd2_sincos(__m128d, __m128d *, __m128d *);
+
+__m128 __vrs4_expf(__m128);
+__m128 __vrs4_logf(__m128);
+__m128 __vrs4_log10f(__m128);
+__m128 __vrs4_log2f(__m128);
+__m128 __vrs4_powf(__m128,__m128);
+__m128 __vrs4_powxf(__m128 x,float y);
+__m128 __vrs4_sinf(__m128);
+__m128 __vrs4_cosf(__m128);
+void __vrs4_sincosf(__m128, __m128 *, __m128 *);
+
+
+/*
+** The array routines: vrda_* take double arrays, vrsa_* take float arrays.
+*/
+void vrda_exp(int, double *, double *);
+void vrda_log(int, double *, double *);
+void vrda_log10(int, double *, double *);
+void vrda_log2(int, double *, double *);
+void vrda_sin(int, double *, double *);
+void vrda_cos(int, double *, double *);
+void vrda_sincos(int, double *, double *, double *);
+
+void vrsa_expf(int, float *, float *);
+void vrsa_logf(int, float *, float *);
+void vrsa_log10f(int, float *, float *);
+void vrsa_log2f(int, float *, float *);
+void vrsa_powf(int n, float *x, float *y, float *z);
+void vrsa_powxf(int n, float *x, float y, float *z);
+void vrsa_sinf(int, float *, float *);
+void vrsa_cosf(int, float *, float *);
+void vrsa_sincosf(int, float *, float *, float *);
+
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/inc/fn_macros.h b/inc/fn_macros.h
new file mode 100644
index 0000000..afc2f59
--- /dev/null
+++ b/inc/fn_macros.h
@@ -0,0 +1,47 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef __FN_MACROS_H__
+#define __FN_MACROS_H__
+/* FN_PROTOTYPE(name) expands to the acml_impl_-prefixed form of name. */
+#if defined(WINDOWS)
+#pragma warning( disable : 4985 )
+#define FN_PROTOTYPE(fn_name) acml_impl_##fn_name
+#else
+/* Non-Windows builds now use the same acml_impl_ prefix, not "__" as before. */
+#define ACML_CONCAT(x,y) x##y
+/* Original definition, disabled: #define FN_PROTOTYPE(fn_name) concat(__,fn_name) */
+#define FN_PROTOTYPE(fn_name) ACML_CONCAT(acml_impl_,fn_name) /* replaces the disabled line above so the build succeeds, !!!!! REVISIT THIS SOON !!!!! */
+#endif
+
+
+#if defined(WINDOWS)
+#define weak_alias(name, aliasname) /* as nothing */
+#else
+/* weak_alias currently expands to nothing here as well.  _weak_alias holds the
+   real weak-alias declaration, but weak_alias no longer forwards to it.  */
+#define weak_alias(name, aliasname) /* disabled: _weak_alias (name, aliasname) */ /* !!!!! REVISIT THIS SOON !!!!! */
+#define _weak_alias(name, aliasname) extern __typeof (name) aliasname __attribute__ ((weak, alias (#name))); 
+#endif
+
+#endif // __FN_MACROS_H__
diff --git a/inc/libm_amd.h b/inc/libm_amd.h
new file mode 100644
index 0000000..66cd46c
--- /dev/null
+++ b/inc/libm_amd.h
@@ -0,0 +1,225 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef LIBM_AMD_H_INCLUDED
+#define LIBM_AMD_H_INCLUDED 1
+
+#include <emmintrin.h>  /* __m128 / __m128d, used by acml_mv_m128.h */
+#include "acml_mv.h"
+#include "acml_mv_m128.h"
+
+#include "fn_macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Each FN_PROTOTYPE(name) below names the prefixed symbol built by fn_macros.h. */
+ double FN_PROTOTYPE(cbrt)(double x);
+ float FN_PROTOTYPE(cbrtf)(float x);
+
+ double FN_PROTOTYPE(fabs)(double x);
+ float FN_PROTOTYPE(fabsf)(float x);
+
+double FN_PROTOTYPE(acos)(double x);
+ float FN_PROTOTYPE(acosf)(float x);
+
+ double FN_PROTOTYPE(acosh)(double x);
+ float FN_PROTOTYPE(acoshf)(float x);
+
+ double FN_PROTOTYPE(asin)(double x);
+ float FN_PROTOTYPE(asinf)(float x);
+
+ double FN_PROTOTYPE( asinh)(double x);
+ float FN_PROTOTYPE(asinhf)(float x);
+
+ double FN_PROTOTYPE( atan)(double x);
+ float FN_PROTOTYPE(atanf)(float x);
+
+ double FN_PROTOTYPE( atanh)(double x);
+ float FN_PROTOTYPE(atanhf)(float x);
+
+ double FN_PROTOTYPE( atan2)(double x, double y);
+ float FN_PROTOTYPE(atan2f)(float x, float y);
+
+ double FN_PROTOTYPE( ceil)(double x);
+ float FN_PROTOTYPE(ceilf)(float x);
+
+
+ double FN_PROTOTYPE( cos)(double x);
+ float FN_PROTOTYPE(cosf)(float x);
+
+ double FN_PROTOTYPE( cosh)(double x);
+ float FN_PROTOTYPE(coshf)(float x);
+
+ double FN_PROTOTYPE( exp)(double x);
+ float FN_PROTOTYPE(expf)(float x);
+
+ double FN_PROTOTYPE( expm1)(double x);
+ float FN_PROTOTYPE(expm1f)(float x);
+
+ double FN_PROTOTYPE( exp2)(double x);
+ float FN_PROTOTYPE(exp2f)(float x);
+
+ double FN_PROTOTYPE( exp10)(double x);
+ float FN_PROTOTYPE(exp10f)(float x);
+
+
+ double FN_PROTOTYPE( fdim)(double x, double y);
+ float FN_PROTOTYPE(fdimf)(float x, float y);
+
+#ifdef WINDOWS  /* both branches are currently identical */
+ int FN_PROTOTYPE(finite)(double x);
+ int FN_PROTOTYPE(finitef)(float x);
+#else
+ int FN_PROTOTYPE(finite)(double x);
+ int FN_PROTOTYPE(finitef)(float x);
+#endif
+
+ double FN_PROTOTYPE( floor)(double x);
+ float FN_PROTOTYPE(floorf)(float x);
+
+ double FN_PROTOTYPE( fmax)(double x, double y);
+ float FN_PROTOTYPE(fmaxf)(float x, float y);
+
+ double FN_PROTOTYPE( fmin)(double x, double y);
+ float FN_PROTOTYPE(fminf)(float x, float y);
+
+ double FN_PROTOTYPE( fmod)(double x, double y);
+ float FN_PROTOTYPE(fmodf)(float x, float y);
+
+#ifdef WINDOWS  /* both branches are currently identical */
+ double FN_PROTOTYPE( hypot)(double x, double y);
+ float FN_PROTOTYPE(hypotf)(float x, float y);
+#else
+ double FN_PROTOTYPE( hypot)(double x, double y);
+ float FN_PROTOTYPE(hypotf)(float x, float y);
+#endif
+
+ float FN_PROTOTYPE(ldexpf)(float x, int exp);
+
+ double FN_PROTOTYPE(ldexp)(double x, int exp);
+
+ double FN_PROTOTYPE( log)(double x);
+ float FN_PROTOTYPE(logf)(float x);
+
+
+ float FN_PROTOTYPE(log2f)(float x);  /* the double log2 is declared further down */
+
+ double FN_PROTOTYPE( log10)(double x);
+ float FN_PROTOTYPE(log10f)(float x);
+
+
+ float FN_PROTOTYPE(log1pf)(float x);  /* the double log1p is declared further down */
+
+#ifdef WINDOWS  /* both branches are currently identical */
+ double FN_PROTOTYPE( logb)(double x);
+ float FN_PROTOTYPE(logbf)(float x);
+#else
+ double FN_PROTOTYPE( logb)(double x);
+ float FN_PROTOTYPE(logbf)(float x);
+#endif
+
+ double FN_PROTOTYPE( modf)(double x, double *iptr);
+ float FN_PROTOTYPE(modff)(float x, float *iptr);
+
+ double FN_PROTOTYPE( nextafter)(double x, double y);
+ float FN_PROTOTYPE(nextafterf)(float x, float y);
+
+ double FN_PROTOTYPE( pow)(double x, double y);
+ float FN_PROTOTYPE(powf)(float x, float y);
+
+double FN_PROTOTYPE( remainder)(double x, double y);
+ float FN_PROTOTYPE(remainderf)(float x, float y);
+
+ double FN_PROTOTYPE(sin)(double x);
+ float FN_PROTOTYPE(sinf)(float x);
+
+ void FN_PROTOTYPE(sincos)(double x, double *s, double *c);
+ void FN_PROTOTYPE(sincosf)(float x, float *s, float *c);
+
+ double FN_PROTOTYPE( sinh)(double x);
+ float FN_PROTOTYPE(sinhf)(float x);
+
+ double FN_PROTOTYPE( sqrt)(double x);
+ float FN_PROTOTYPE(sqrtf)(float x);
+
+ double FN_PROTOTYPE( tan)(double x);
+ float FN_PROTOTYPE(tanf)(float x);
+
+ double FN_PROTOTYPE( tanh)(double x);
+ float FN_PROTOTYPE(tanhf)(float x);
+
+ double FN_PROTOTYPE( trunc)(double x);
+ float FN_PROTOTYPE(truncf)(float x);
+
+ double FN_PROTOTYPE( log1p)(double x);
+ double FN_PROTOTYPE( log2)(double x);
+
+ double FN_PROTOTYPE(cosh)(double x);  /* repeats the cosh declaration above */
+ float FN_PROTOTYPE(coshf)(float fx);  /* repeats the coshf declaration above */
+
+ double FN_PROTOTYPE(frexp)(double value, int *exp);
+ float FN_PROTOTYPE(frexpf)(float value, int *exp);
+ int FN_PROTOTYPE(ilogb)(double x);
+ int FN_PROTOTYPE(ilogbf)(float x);
+
+ long long int FN_PROTOTYPE(llrint)(double x);
+ long long int FN_PROTOTYPE(llrintf)(float x);
+ long int FN_PROTOTYPE(lrint)(double x);
+ long int FN_PROTOTYPE(lrintf)(float x);
+ long int FN_PROTOTYPE(lround)(double d);
+ long int FN_PROTOTYPE(lroundf)(float f);
+ double  FN_PROTOTYPE(nan)(const char *tagp);
+ float  FN_PROTOTYPE(nanf)(const char *tagp);
+ float FN_PROTOTYPE(nearbyintf)(float x);
+ double FN_PROTOTYPE(nearbyint)(double x);
+ double FN_PROTOTYPE(nextafter)(double x, double y);  /* repeats the nextafter declaration above */
+ float FN_PROTOTYPE(nextafterf)(float x, float y);  /* repeats the nextafterf declaration above */
+ double FN_PROTOTYPE(nexttoward)(double x, long double y);
+ float FN_PROTOTYPE(nexttowardf)(float x, long double y);
+ double FN_PROTOTYPE(rint)(double x);
+ float FN_PROTOTYPE(rintf)(float x);
+ float FN_PROTOTYPE(roundf)(float f);
+ double FN_PROTOTYPE(round)(double f);
+ double FN_PROTOTYPE(scalbln)(double x, long int n);
+ float FN_PROTOTYPE(scalblnf)(float x, long int n);
+ double FN_PROTOTYPE(scalbn)(double x, int n);
+ float FN_PROTOTYPE(scalbnf)(float x, int n);
+ long long int FN_PROTOTYPE(llroundf)(float f);
+ long long int FN_PROTOTYPE(llround)(double d);
+
+
+#ifdef WINDOWS  /* both branches are currently identical */
+ double FN_PROTOTYPE(copysign)(double x, double y);
+ float FN_PROTOTYPE(copysignf)(float x, float y);
+#else
+ double FN_PROTOTYPE(copysign)(double x, double y);
+ float FN_PROTOTYPE(copysignf)(float x, float y);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBM_AMD_H_INCLUDED */
diff --git a/inc/libm_errno_amd.h b/inc/libm_errno_amd.h
new file mode 100644
index 0000000..1e6b8b9
--- /dev/null
+++ b/inc/libm_errno_amd.h
@@ -0,0 +1,33 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef LIBM_ERRNO_AMD_H_INCLUDED
+#define LIBM_ERRNO_AMD_H_INCLUDED 1
+
+#include <stdio.h>
+#include <errno.h>
+/* Fallback setter, parenthesized so it stays one unit inside an expression. */
+#ifndef __set_errno
+#define __set_errno(x) (errno = (x))
+#endif
+#endif /* LIBM_ERRNO_AMD_H_INCLUDED */
diff --git a/inc/libm_inlines_amd.h b/inc/libm_inlines_amd.h
new file mode 100644
index 0000000..a2e387a
--- /dev/null
+++ b/inc/libm_inlines_amd.h
@@ -0,0 +1,2188 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef LIBM_INLINES_AMD_H_INCLUDED
+#define LIBM_INLINES_AMD_H_INCLUDED 1
+
+#include "libm_util_amd.h"
+#include <math.h>
+
+#ifdef WINDOWS
+#define inline __inline
+#include "emmintrin.h"
+#endif
+
+/* Compile-time verification that type long long is the same size as type
+   double: the array bound below is -1 (a compile error) if they differ */
+void check_long_against_double_size(int machine_is_64_bit[(sizeof(long long) == sizeof(double))?1:-1]); 
+
+/* Set defines for inline functions calling other inlines */
+#if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \
+    defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \
+    defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \
+    defined(USE_INDEFINITE_WITH_FLAGS) || defined(USE_INDEFINITEF_WITH_FLAGS) || \
+    defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \
+    defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) || \
+    (defined(WINDOWS) && (defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)))
+#undef USE_RAISE_FPSW_FLAGS
+#define USE_RAISE_FPSW_FLAGS 1
+#endif
+
+#if defined(USE_SPLITDOUBLE)
+/* Decompose x into an exponent *e and a mantissa *m with 0.5 <= |*m| < 1.0,
+   so that x == *m * 2**(*e).  The caller must not pass zero, a denormal,
+   an infinity or a NaN; none of those cases is detected here. */
+static inline void splitDouble(double x, int *e, double *m)
+{
+  unsigned long long bits, biased;
+  GET_BITS_DP64(x, bits);
+  biased = (bits & EXPBITS_DP64) >> EXPSHIFTBITS_DP64;
+  *e = (int)biased - EXPBIAS_DP64 + 1;
+  /* Keep sign and mantissa, force the exponent field to that of 0.5 */
+  bits &= (SIGNBIT_DP64 | MANTBITS_DP64);
+  bits |= HALFEXPBITS_DP64;
+  PUT_BITS_DP64(bits, x);
+  *m = x;
+}
+#endif /* USE_SPLITDOUBLE */
+
+
+#if defined(USE_SPLITDOUBLE_2)
+/* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0.
+   Assumes that x is not zero, denormal, infinity or NaN, but these conditions
+   are not checked. Also assumes EXPBIAS_DP64 is odd. With this
+   assumption, e will be even on exit. */
+static inline void splitDouble_2(double x, int *e, double *m)
+{
+  unsigned long long ux, vx;
+  GET_BITS_DP64(x, ux);
+  vx = ux;
+  ux &= EXPBITS_DP64;
+  ux >>= EXPSHIFTBITS_DP64;
+  if (ux & 1)
+    {
+      /* The biased exponent is odd: m takes the exponent of 1.0 */
+      vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64;
+      PUT_BITS_DP64(vx, x);
+      *m = x;
+      *e = (int)ux - EXPBIAS_DP64;  /* signed arithmetic, no unsigned wrap */
+    }
+  else
+    {
+      /* The biased exponent is even: m takes the exponent of 2.0 */
+      vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64;
+      PUT_BITS_DP64(vx, x);
+      *m = x;
+      *e = (int)ux - EXPBIAS_DP64 - 1;
+    }
+}
+#endif /* USE_SPLITDOUBLE_2 */
+
+
+#if defined(USE_SPLITFLOAT)
+/* Decompose x into an exponent *e and a mantissa *m with 0.5 <= |*m| < 1.0,
+   so that x == *m * 2**(*e).  The caller must not pass zero, a denormal,
+   an infinity or a NaN; none of those cases is detected here. */
+static inline void splitFloat(float x, int *e, float *m)
+{
+  unsigned int bits, biased;
+  GET_BITS_SP32(x, bits);
+  biased = (bits & EXPBITS_SP32) >> EXPSHIFTBITS_SP32;
+  *e = (int)biased - EXPBIAS_SP32 + 1;
+  /* Keep sign and mantissa, force the exponent field to that of 0.5 */
+  bits &= (SIGNBIT_SP32 | MANTBITS_SP32);
+  bits |= HALFEXPBITS_SP32;
+  PUT_BITS_SP32(bits, x);
+  *m = x;
+}
+#endif /* USE_SPLITFLOAT */
+
+
+#if defined(USE_SCALEDOUBLE_1)
+/* Returns x * 2.0**n.  The power of two is built directly from its bit
+   pattern, so n must satisfy EMIN <= n <= EMAX; this is not verified. */
+static inline double scaleDouble_1(double x, int n)
+{
+  double pow2n;
+  /* Biased exponent moved into the exponent field; mantissa and sign zero */
+  PUT_BITS_DP64(((long long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow2n);
+  return x * pow2n;
+}
+#endif /* USE_SCALEDOUBLE_1 */
+
+
+#if defined(USE_SCALEDOUBLE_2)
+/* Returns x * 2.0**n, applied as two successive power-of-two factors.
+   Requires 2*EMIN <= n <= 2*EMAX; the range is not verified. */
+static inline double scaleDouble_2(double x, int n)
+{
+  double pow_half, pow_rest;
+  const int half = n / 2;
+  const int rest = n - half;
+  /* Build 2.0**half and 2.0**rest straight from their exponent fields;
+     half + rest == n, so the two factors together give 2.0**n */
+  PUT_BITS_DP64(((long long)half + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_half);
+  PUT_BITS_DP64(((long long)rest + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_rest);
+  return (x * pow_half) * pow_rest;
+}
+#endif /* USE_SCALEDOUBLE_2 */
+
+
+#if defined(USE_SCALEDOUBLE_3)
+/* Returns x * 2.0**n, applied as three successive power-of-two factors.
+   Requires 3*EMIN <= n <= 3*EMAX; the range is not verified. */
+static inline double scaleDouble_3(double x, int n)
+{
+  double pow_a, pow_b, pow_c;
+  const int part_a = n / 3;
+  const int part_b = (n - part_a) / 2;
+  const int part_c = n - part_a - part_b;
+  /* Build the three powers of two straight from their exponent fields;
+     part_a + part_b + part_c == n */
+  PUT_BITS_DP64(((long long)part_a + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_a);
+  PUT_BITS_DP64(((long long)part_b + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_b);
+  PUT_BITS_DP64(((long long)part_c + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, pow_c);
+  return ((x * pow_a) * pow_b) * pow_c;
+}
+#endif /* USE_SCALEDOUBLE_3 */
+
+
+#if defined(USE_SCALEFLOAT_1)
+/* Returns x * 2.0**n.  The power of two is built directly from its bit
+   pattern, so n must satisfy EMIN <= n <= EMAX; this is not verified. */
+static inline float scaleFloat_1(float x, int n)
+{
+  float pow2n;
+  /* Biased exponent moved into the exponent field; mantissa and sign zero */
+  PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow2n);
+  return x * pow2n;
+}
+#endif /* USE_SCALEFLOAT_1 */
+
+
+#if defined(USE_SCALEFLOAT_2)
+/* Returns x * 2.0**n, applied as two successive power-of-two factors.
+   Requires 2*EMIN <= n <= 2*EMAX; the range is not verified. */
+static inline float scaleFloat_2(float x, int n)
+{
+  float pow_half, pow_rest;
+  const int half = n / 2;
+  const int rest = n - half;
+  /* Build 2.0**half and 2.0**rest straight from their exponent fields;
+     half + rest == n, so the two factors together give 2.0**n */
+  PUT_BITS_SP32((half + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_half);
+  PUT_BITS_SP32((rest + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_rest);
+  return (x * pow_half) * pow_rest;
+}
+#endif /* USE_SCALEFLOAT_2 */
+
+
+#if defined(USE_SCALEFLOAT_3)
+/* Returns x * 2.0**n, applied as three successive power-of-two factors.
+   Requires 3*EMIN <= n <= 3*EMAX; the range is not verified. */
+static inline float scaleFloat_3(float x, int n)
+{
+  float pow_a, pow_b, pow_c;
+  const int part_a = n / 3;
+  const int part_b = (n - part_a) / 2;
+  const int part_c = n - part_a - part_b;
+  /* Build the three powers of two straight from their exponent fields;
+     part_a + part_b + part_c == n */
+  PUT_BITS_SP32((part_a + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_a);
+  PUT_BITS_SP32((part_b + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_b);
+  PUT_BITS_SP32((part_c + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, pow_c);
+  return ((x * pow_a) * pow_b) * pow_c;
+}
+#endif /* USE_SCALEFLOAT_3 */
+
+#if defined(USE_SETPRECISIONDOUBLE)
+unsigned int setPrecisionDouble(void)
+{
+  /* There is no precision control on Hammer, so nothing is changed and
+     the "previous" control word reported to the caller is always 0. */
+  return 0;
+}
+#endif /* USE_SETPRECISIONDOUBLE */
+
+#if defined(USE_RESTOREPRECISION)
+void restorePrecision(unsigned int cwold)
+{
+#if defined(WINDOWS)
+  /* There is no precision control on Hammer */
+#elif defined(linux) || defined(__linux__)
+  /* There is no precision control on Hammer */
+#else
+#error Unknown machine
+#endif
+  (void)cwold; /* nothing to restore; cast silences unused-parameter warnings */
+}
+#endif /* USE_RESTOREPRECISION */
+
+
+#if defined(USE_CLEAR_FPSW_FLAGS)
+/* Clears floating-point status flags in MXCSR. The argument should be
+   the bitwise or of the AMD_F_* flags to be cleared; every other
+   MXCSR bit is written back unchanged, e.g.
+     clear_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
+ */
+static inline void clear_fpsw_flags(int flags)
+{
+#if defined(WINDOWS)
+  unsigned int cw = _mm_getcsr();
+  cw &= (~flags);
+  _mm_setcsr(cw);
+#elif defined(linux) || defined(__linux__)
+  unsigned int cw;
+  /* Read MXCSR; __asm__/__linux__ also work in strict ISO C modes */
+  __asm__ __volatile__ ("STMXCSR %0" : "=m" (cw));
+  cw &= (~flags);
+  __asm__ __volatile__ ("LDMXCSR %0" : : "m" (cw));
+#else
+#error Unknown machine
+#endif
+}
+#endif /* USE_CLEAR_FPSW_FLAGS */
+
+
+#if defined(USE_RAISE_FPSW_FLAGS)
+/* Raises floating-point status flags in MXCSR. The argument should be
+   the bitwise or of the AMD_F_* flags to be raised; every other
+   MXCSR bit is written back unchanged, e.g.
+     raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID);
+ */
+static inline void raise_fpsw_flags(int flags)
+{
+#if defined(WINDOWS)
+  _mm_setcsr(_mm_getcsr() | flags);
+#elif defined(linux) || defined(__linux__)
+  unsigned int cw;
+  /* Read MXCSR; __asm__/__linux__ also work in strict ISO C modes */
+  __asm__ __volatile__ ("STMXCSR %0" : "=m" (cw));
+  cw |= flags;
+  __asm__ __volatile__ ("LDMXCSR %0" : : "m" (cw));
+#else
+#error Unknown machine
+#endif
+}
+#endif /* USE_RAISE_FPSW_FLAGS */
+
+
+#if defined(USE_GET_FPSW_INLINE)
+/* Return the current floating-point control/status word (MXCSR) */
+static inline unsigned int get_fpsw_inline(void)
+{
+#if defined(WINDOWS)
+  return _mm_getcsr();
+#elif defined(linux) || defined(__linux__)
+  unsigned int sw;
+  __asm__ __volatile__ ("STMXCSR %0" : "=m" (sw));
+  return sw;
+#else
+#error Unknown machine
+#endif
+}
+#endif /* USE_GET_FPSW_INLINE */
+
+#if defined(USE_SET_FPSW_INLINE)
+/* Set the floating-point control/status word (MXCSR) to sw */
+static inline void set_fpsw_inline(unsigned int sw)
+{
+#if defined(WINDOWS)
+  _mm_setcsr(sw);
+#elif defined(linux) || defined(__linux__)
+  /* Load sw into MXCSR; __asm__ also works in strict ISO C modes */
+  __asm__ __volatile__ ("LDMXCSR %0" : : "m" (sw));
+#else
+#error Unknown machine
+#endif
+}
+#endif /* USE_SET_FPSW_INLINE */
+
+#if defined(USE_CLEAR_FPSW_INLINE)
+/* Clear the five AMD_F_* exception flags listed below from MXCSR */
+static inline void clear_fpsw_inline(void)
+{
+#if defined(WINDOWS)
+  unsigned int cw;
+  cw = _mm_getcsr();
+  cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
+          AMD_F_DIVBYZERO | AMD_F_INVALID);
+  _mm_setcsr(cw);
+#elif defined(linux) || defined(__linux__)
+  unsigned int cw;
+  /* Read MXCSR; __asm__/__linux__ also work in strict ISO C modes */
+  __asm__ __volatile__ ("STMXCSR %0" : "=m" (cw));
+  cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW |
+          AMD_F_DIVBYZERO | AMD_F_INVALID);
+  __asm__ __volatile__ ("LDMXCSR %0" : : "m" (cw));
+#else
+#error Unknown machine
+#endif
+}
+#endif /* USE_CLEAR_FPSW_INLINE */
+
+
+#if defined(USE_VAL_WITH_FLAGS)
+/* Raises the given status flags and then returns val unchanged,
+   e.g.  val_with_flags(x, AMD_F_INEXACT); */
+static inline double val_with_flags(double val, int flags)
+{
+  const double result = val;
+  raise_fpsw_flags(flags);
+  return result;
+}
+#endif /* USE_VAL_WITH_FLAGS */
+
+#if defined(USE_VALF_WITH_FLAGS)
+/* Raises the given status flags and then returns val unchanged,
+   e.g.  valf_with_flags(x, AMD_F_INEXACT); */
+static inline float valf_with_flags(float val, int flags)
+{
+  const float result = val;
+  raise_fpsw_flags(flags);
+  return result;
+}
+#endif /* USE_VALF_WITH_FLAGS */
+
+
+#if defined(USE_ZERO_WITH_FLAGS)
+/* Raises the given status flags and then returns a double +0.0,
+   e.g.  zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); */
+static inline double zero_with_flags(int flags)
+{
+  const double positive_zero = 0.0;
+  raise_fpsw_flags(flags);
+  return positive_zero;
+}
+#endif /* USE_ZERO_WITH_FLAGS */
+
+
+#if defined(USE_ZEROF_WITH_FLAGS)
+/* Raises the given status flags and then returns a float +0.0,
+   e.g.  zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); */
+static inline float zerof_with_flags(int flags)
+{
+  const float positive_zero = 0.0F;
+  raise_fpsw_flags(flags);
+  return positive_zero;
+}
+#endif /* USE_ZEROF_WITH_FLAGS */
+
+
+#if defined(USE_NAN_WITH_FLAGS)
+/* Raises the given status flags and then returns a double quiet NaN
+   with the sign bit clear, e.g.  nan_with_flags(AMD_F_INVALID); */
+static inline double nan_with_flags(int flags)
+{
+  double result;
+  raise_fpsw_flags(flags);
+  /* Exponent all ones, top mantissa bit set, sign bit clear */
+  PUT_BITS_DP64(0x7ff8000000000000, result);
+  return result;
+}
+#endif /* USE_NAN_WITH_FLAGS */
+
+#if defined(USE_NANF_WITH_FLAGS)
+/* Raises the given status flags and then returns a float quiet NaN
+   with the sign bit clear, e.g.  nanf_with_flags(AMD_F_INVALID); */
+static inline float nanf_with_flags(int flags)
+{
+  float result;
+  raise_fpsw_flags(flags);
+  /* Exponent all ones, top mantissa bit set, sign bit clear */
+  PUT_BITS_SP32(0x7fc00000, result);
+  return result;
+}
+#endif /* USE_NANF_WITH_FLAGS */
+
+
+#if defined(USE_INDEFINITE_WITH_FLAGS)
+/* Raises the given status flags and then returns the double "indefinite"
+   quiet NaN, e.g.  indefinite_with_flags(AMD_F_INVALID); */
+static inline double indefinite_with_flags(int flags)
+{
+  double result;
+  raise_fpsw_flags(flags);
+  /* Exponent all ones, top mantissa bit set, sign bit set */
+  PUT_BITS_DP64(0xfff8000000000000, result);
+  return result;
+}
+#endif /* USE_INDEFINITE_WITH_FLAGS */
+
+#if defined(USE_INDEFINITEF_WITH_FLAGS)
+/* Raises the given status flags and then returns the float "indefinite"
+   quiet NaN, e.g.  indefinitef_with_flags(AMD_F_INVALID); */
+static inline float indefinitef_with_flags(int flags)
+{
+  float result;
+  raise_fpsw_flags(flags);
+  /* Exponent all ones, top mantissa bit set, sign bit set */
+  PUT_BITS_SP32(0xffc00000, result);
+  return result;
+}
+#endif /* USE_INDEFINITEF_WITH_FLAGS */
+
+
+#ifdef USE_INFINITY_WITH_FLAGS
+/* Raises the given status flags and then returns a double +infinity,
+   e.g.  infinity_with_flags(AMD_F_OVERFLOW); */
+static inline double infinity_with_flags(int flags)
+{
+  double result;
+  raise_fpsw_flags(flags);
+  /* Biased exponent BIASEDEMAX_DP64 + 1, with zero mantissa and sign */
+  PUT_BITS_DP64((unsigned long long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, result);
+  return result;
+}
+#endif /* USE_INFINITY_WITH_FLAGS */
+
+#ifdef USE_INFINITYF_WITH_FLAGS
+/* Raises the given status flags and then returns a float +infinity,
+   e.g.  infinityf_with_flags(AMD_F_OVERFLOW); */
+static inline float infinityf_with_flags(int flags)
+{
+  float result;
+  raise_fpsw_flags(flags);
+  /* Biased exponent BIASEDEMAX_SP32 + 1, with zero mantissa and sign */
+  PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, result);
+  return result;
+}
+#endif /* USE_INFINITYF_WITH_FLAGS */
+
+
+#if defined(USE_SPLITEXP)
+/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
+   Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
+   abs(x) > large/(ln(base)) (where large is the largest representable
+   floating point number) should be handled separately instead of calling
+   this function. This function is called by exp_amd, exp2_amd, exp10_amd,
+   cosh_amd and sinh_amd. */
+static inline void splitexp(double x, double logbase,
+                            double thirtytwo_by_logbaseof2,
+                            double logbaseof2_by_32_lead,
+                            double logbaseof2_by_32_trail,
+                            int *m, double *z1, double *z2)
+{
+  double q, r, r1, r2, f1, f2;
+  int n, j;
+
+/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
+   leading and trailing parts respectively of precomputed
+   values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
+   two_to_jby32_lead_table contains the first 25 bits of precision,
+   and two_to_jby32_trail_table contains a further 53 bits precision. */
+
+  static const double two_to_jby32_lead_table[32] = {
+    1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
+    1.02189713716506958008e+00,   /* 0x3ff059b0d0000000 */
+    1.04427373409271240234e+00,   /* 0x3ff0b55860000000 */
+    1.06714040040969848633e+00,   /* 0x3ff11301d0000000 */
+    1.09050768613815307617e+00,   /* 0x3ff172b830000000 */
+    1.11438673734664916992e+00,   /* 0x3ff1d48730000000 */
+    1.13878858089447021484e+00,   /* 0x3ff2387a60000000 */
+    1.16372483968734741211e+00,   /* 0x3ff29e9df0000000 */
+    1.18920707702636718750e+00,   /* 0x3ff306fe00000000 */
+    1.21524733304977416992e+00,   /* 0x3ff371a730000000 */
+    1.24185776710510253906e+00,   /* 0x3ff3dea640000000 */
+    1.26905095577239990234e+00,   /* 0x3ff44e0860000000 */
+    1.29683953523635864258e+00,   /* 0x3ff4bfdad0000000 */
+    1.32523661851882934570e+00,   /* 0x3ff5342b50000000 */
+    1.35425549745559692383e+00,   /* 0x3ff5ab07d0000000 */
+    1.38390988111495971680e+00,   /* 0x3ff6247eb0000000 */
+    1.41421353816986083984e+00,   /* 0x3ff6a09e60000000 */
+    1.44518077373504638672e+00,   /* 0x3ff71f75e0000000 */
+    1.47682613134384155273e+00,   /* 0x3ff7a11470000000 */
+    1.50916439294815063477e+00,   /* 0x3ff8258990000000 */
+    1.54221081733703613281e+00,   /* 0x3ff8ace540000000 */
+    1.57598084211349487305e+00,   /* 0x3ff93737b0000000 */
+    1.61049032211303710938e+00,   /* 0x3ff9c49180000000 */
+    1.64575546979904174805e+00,   /* 0x3ffa5503b0000000 */
+    1.68179279565811157227e+00,   /* 0x3ffae89f90000000 */
+    1.71861928701400756836e+00,   /* 0x3ffb7f76f0000000 */
+    1.75625211000442504883e+00,   /* 0x3ffc199bd0000000 */
+    1.79470902681350708008e+00,   /* 0x3ffcb720d0000000 */
+    1.83400803804397583008e+00,   /* 0x3ffd5818d0000000 */
+    1.87416762113571166992e+00,   /* 0x3ffdfc9730000000 */
+    1.91520655155181884766e+00,   /* 0x3ffea4afa0000000 */
+    1.95714408159255981445e+00};  /* 0x3fff507650000000 */
+
+  static const double two_to_jby32_trail_table[32] = {
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    1.14890470981563546737e-08,   /* 0x3e48ac2ba1d73e2a */
+    4.83347014379782142328e-08,   /* 0x3e69f3121ec53172 */
+    2.67125131841396124714e-10,   /* 0x3df25b50a4ebbf1b */
+    4.65271045830351350190e-08,   /* 0x3e68faa2f5b9bef9 */
+    5.24924336638693782574e-09,   /* 0x3e368b9aa7805b80 */
+    5.38622214388600821910e-08,   /* 0x3e6ceac470cd83f6 */
+    1.90902301017041969782e-08,   /* 0x3e547f7b84b09745 */
+    3.79763538792174980894e-08,   /* 0x3e64636e2a5bd1ab */
+    2.69306947081946450986e-08,   /* 0x3e5ceaa72a9c5154 */
+    4.49683815095311756138e-08,   /* 0x3e682468446b6824 */
+    1.41933332021066904914e-09,   /* 0x3e18624b40c4dbd0 */
+    1.94146510233556266402e-08,   /* 0x3e54d8a89c750e5e */
+    2.46409119489264118569e-08,   /* 0x3e5a753e077c2a0f */
+    4.94812958044698886494e-08,   /* 0x3e6a90a852b19260 */
+    8.48872238075784476136e-10,   /* 0x3e0d2ac258f87d03 */
+    2.42032342089579394887e-08,   /* 0x3e59fcef32422cbf */
+    3.32420002333182569170e-08,   /* 0x3e61d8bee7ba46e2 */
+    1.45956577586525322754e-08,   /* 0x3e4f580c36bea881 */
+    3.46452721050003920866e-08,   /* 0x3e62999c25159f11 */
+    8.07090469079979051284e-09,   /* 0x3e415506dadd3e2a */
+    2.99439161340839520436e-09,   /* 0x3e29b8bc9e8a0388 */
+    9.83621719880452147153e-09,   /* 0x3e451f8480e3e236 */
+    8.35492309647188080486e-09,   /* 0x3e41f12ae45a1224 */
+    3.48493175137966283582e-08,   /* 0x3e62b5a75abd0e6a */
+    1.11084703472699692902e-08,   /* 0x3e47daf237553d84 */
+    5.03688744342840346564e-08,   /* 0x3e6b0aa538444196 */
+    4.81896001063495806249e-08,   /* 0x3e69df20d22a0798 */
+    4.83653666334089557746e-08,   /* 0x3e69f7490e4bb40b */
+    1.29745882314081237628e-08,   /* 0x3e4bdcdaf5cb4656 */
+    9.84532844621636118964e-09,   /* 0x3e452486cc2c7b9d */
+    4.25828404545651943883e-08};  /* 0x3e66dc8a80ce9f09 */
+
+    /*
+      Step 1. Reduce the argument.
+
+      To perform argument reduction, we find the integer n such that
+      x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
+      n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
+      remainder by x - n*logbaseof2/32. The calculation of n is
+      straightforward whereas the computation of x - n*logbaseof2/32
+      must be carried out carefully.
+      logbaseof2/32 is so represented in two pieces that
+      (1) logbaseof2/32 is known to extra precision, (2) the product
+      of n and the leading piece is a model number and is hence
+      calculated without error, and (3) the subtraction of the value
+      obtained in (2) from x is a model number and is hence again
+      obtained without error.
+    */
+
+    r = x * thirtytwo_by_logbaseof2;
+    /* Set n = nearest integer to r */
+    /* Truncating r +/- 0.5 rounds halves away from zero; faster on Hammer */
+    if (r > 0)
+      n = (int)(r + 0.5);
+    else
+      n = (int)(r - 0.5);
+
+    r1 = x - n * logbaseof2_by_32_lead;  /* leading part of the remainder */
+    r2 =   - n * logbaseof2_by_32_trail; /* trailing correction to r1 */
+
+    /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
+    /* j = n % 32;
+       if (j < 0) j += 32; */
+    j = n & 0x0000001f;  /* low 5 bits of n, used as the table index */
+
+    f1 = two_to_jby32_lead_table[j];
+    f2 = two_to_jby32_trail_table[j];
+
+    *m = (n - j) / 32;  /* n - j has its low 5 bits clear */
+
+    /* Step 2. The following is the core approximation. We approximate
+       exp(r1+r2)-1 by a polynomial. */
+
+    r1 *= logbase; r2 *= logbase;  /* scale both remainder parts by logbase */
+
+    r = r1 + r2;
+    q = r1 + (r2 +
+              r*r*( 5.00000000000000008883e-01 +
+                      r*( 1.66666666665260878863e-01 +
+                      r*( 4.16666666662260795726e-02 +
+                      r*( 8.33336798434219616221e-03 +
+                      r*( 1.38889490863777199667e-03 ))))));
+
+    /* Step 3. Function value reconstruction.
+       We now reconstruct the exponential of the input argument
+       so that exp(x) = 2**m * (z1 + z2).
+       The order of the computation below must be strictly observed. */
+
+    *z1 = f1;  /* leading table part is returned untouched */
+    *z2 = f2 + ((f1 + f2) * q);
+}
+#endif /* USE_SPLITEXP */
+
+
+#if defined(USE_SPLITEXPF)
+/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2).
+   Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments
+   abs(x) > large/(ln(base)) (where large is the largest representable
+   floating point number) should be handled separately instead of calling
+   this function. This function is called by exp_amd, exp2_amd, exp10_amd,
+   cosh_amd and sinh_amd. */
+static inline void splitexpf(float x, float logbase,
+                             float thirtytwo_by_logbaseof2,
+                             float logbaseof2_by_32_lead,
+                             float logbaseof2_by_32_trail,
+                             int *m, float *z1, float *z2)
+{
+  float q, r, r1, r2, f1, f2;
+  int n, j;
+
+/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain
+   leading and trailing parts respectively of precomputed
+   values of pow(2.0,j/32.0), for j = 0, 1, ..., 31.
+   two_to_jby32_lead_table contains the first 10 bits of precision,
+   and two_to_jby32_trail_table contains a further 24 bits precision. */
+
+  static const float two_to_jby32_lead_table[32] = {
+    1.0000000000E+00F,  /* 0x3F800000 */
+    1.0214843750E+00F,  /* 0x3F82C000 */
+    1.0429687500E+00F,  /* 0x3F858000 */
+    1.0664062500E+00F,  /* 0x3F888000 */
+    1.0898437500E+00F,  /* 0x3F8B8000 */
+    1.1132812500E+00F,  /* 0x3F8E8000 */
+    1.1386718750E+00F,  /* 0x3F91C000 */
+    1.1621093750E+00F,  /* 0x3F94C000 */
+    1.1875000000E+00F,  /* 0x3F980000 */
+    1.2148437500E+00F,  /* 0x3F9B8000 */
+    1.2402343750E+00F,  /* 0x3F9EC000 */
+    1.2675781250E+00F,  /* 0x3FA24000 */
+    1.2949218750E+00F,  /* 0x3FA5C000 */
+    1.3242187500E+00F,  /* 0x3FA98000 */
+    1.3535156250E+00F,  /* 0x3FAD4000 */
+    1.3828125000E+00F,  /* 0x3FB10000 */
+    1.4140625000E+00F,  /* 0x3FB50000 */
+    1.4433593750E+00F,  /* 0x3FB8C000 */
+    1.4765625000E+00F,  /* 0x3FBD0000 */
+    1.5078125000E+00F,  /* 0x3FC10000 */
+    1.5410156250E+00F,  /* 0x3FC54000 */
+    1.5742187500E+00F,  /* 0x3FC98000 */
+    1.6093750000E+00F,  /* 0x3FCE0000 */
+    1.6445312500E+00F,  /* 0x3FD28000 */
+    1.6816406250E+00F,  /* 0x3FD74000 */
+    1.7167968750E+00F,  /* 0x3FDBC000 */
+    1.7558593750E+00F,  /* 0x3FE0C000 */
+    1.7929687500E+00F,  /* 0x3FE58000 */
+    1.8339843750E+00F,  /* 0x3FEAC000 */
+    1.8730468750E+00F,  /* 0x3FEFC000 */
+    1.9140625000E+00F,  /* 0x3FF50000 */
+    1.9570312500E+00F}; /* 0x3FFA8000 */
+
+  static const float two_to_jby32_trail_table[32] = {
+    0.0000000000E+00F,  /* 0x00000000 */
+    4.1277357377E-04F,  /* 0x39D86988 */
+    1.3050324051E-03F,  /* 0x3AAB0D9F */
+    7.3415064253E-04F,  /* 0x3A407404 */
+    6.6398258787E-04F,  /* 0x3A2E0F1E */
+    1.1054925853E-03F,  /* 0x3A90E62D */
+    1.1675967835E-04F,  /* 0x38F4DCE0 */
+    1.6154836630E-03F,  /* 0x3AD3BEA3 */
+    1.7071149778E-03F,  /* 0x3ADFC146 */
+    4.0360994171E-04F,  /* 0x39D39B9C */
+    1.6234370414E-03F,  /* 0x3AD4C982 */
+    1.4728321694E-03F,  /* 0x3AC10C0C */
+    1.9176795613E-03F,  /* 0x3AFB5AA6 */
+    1.0178930825E-03F,  /* 0x3A856AD3 */
+    7.3992193211E-04F,  /* 0x3A41F752 */
+    1.0973819299E-03F,  /* 0x3A8FD607 */
+    1.5106226783E-04F,  /* 0x391E6678 */
+    1.8214319134E-03F,  /* 0x3AEEBD1D */
+    2.6364589576E-04F,  /* 0x398A39F4 */
+    1.3519275235E-03F,  /* 0x3AB13329 */
+    1.1952003697E-03F,  /* 0x3A9CA845 */
+    1.7620950239E-03F,  /* 0x3AE6F619 */
+    1.1153318919E-03F,  /* 0x3A923054 */
+    1.2242280645E-03F,  /* 0x3AA07647 */
+    1.5220546629E-04F,  /* 0x391F9958 */
+    1.8224230735E-03F,  /* 0x3AEEDE5F */
+    3.9278529584E-04F,  /* 0x39CDEEC0 */
+    1.7403248930E-03F,  /* 0x3AE41B9D */
+    2.3711356334E-05F,  /* 0x37C6E7C0 */
+    1.1207590578E-03F,  /* 0x3A92E66F */
+    1.1440613307E-03F,  /* 0x3A95F454 */
+    1.1287408415E-04F}; /* 0x38ECB6D0 */
+
+    /*
+      Step 1. Reduce the argument.
+
+      To perform argument reduction, we find the integer n such that
+      x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64.
+      n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and
+      remainder by x - n*logbaseof2/32. The calculation of n is
+      straightforward whereas the computation of x - n*logbaseof2/32
+      must be carried out carefully.
+      logbaseof2/32 is so represented in two pieces that
+      (1) logbaseof2/32 is known to extra precision, (2) the product
+      of n and the leading piece is a model number and is hence
+      calculated without error, and (3) the subtraction of the value
+      obtained in (2) from x is a model number and is hence again
+      obtained without error.
+    */
+
+    r = x * thirtytwo_by_logbaseof2;
+    /* Set n = nearest integer to r */
+    /* Truncating r +/- 0.5 rounds halves away from zero; faster on Hammer */
+    if (r > 0)
+      n = (int)(r + 0.5F);
+    else
+      n = (int)(r - 0.5F);
+
+    r1 = x - n * logbaseof2_by_32_lead;  /* leading part of the remainder */
+    r2 =   - n * logbaseof2_by_32_trail; /* trailing correction to r1 */
+
+    /* Set j = n mod 32:   5 mod 32 = 5,   -5 mod 32 = 27,  etc. */
+    /* j = n % 32;
+       if (j < 0) j += 32; */
+    j = n & 0x0000001f;  /* low 5 bits of n, used as the table index */
+
+    f1 = two_to_jby32_lead_table[j];
+    f2 = two_to_jby32_trail_table[j];
+
+    *m = (n - j) / 32;  /* n - j has its low 5 bits clear */
+
+    /* Step 2. The following is the core approximation. We approximate
+       exp(r1+r2)-1 by a polynomial. */
+
+    r1 *= logbase; r2 *= logbase;  /* scale both remainder parts by logbase */
+
+    r = r1 + r2;
+    q = r1 + (r2 +
+              r*r*( 5.00000000000000008883e-01F +
+                      r*( 1.66666666665260878863e-01F )));
+
+    /* Step 3. Function value reconstruction.
+       We now reconstruct the exponential of the input argument
+       so that exp(x) = 2**m * (z1 + z2).
+       The order of the computation below must be strictly observed. */
+
+    *z1 = f1;  /* leading table part is returned untouched */
+    *z2 = f2 + ((f1 + f2) * q);
+}
+#endif /* USE_SPLITEXPF */
+
+
+#if defined(USE_SCALEUPDOUBLE1024)
+/* Multiplies by 2**1024 the double (normal or denormal) whose bit pattern
+   is ux and stores the resulting bit pattern in *ur.  No check is made
+   that the value can actually be scaled up by that amount. */
+static inline void scaleUpDouble1024(unsigned long long ux, unsigned long long *ur)
+{
+  unsigned long long scaled;
+  double tmp;
+
+  if ((ux & EXPBITS_DP64) != 0)
+    {
+      /* Normal input: add 0x400 (= 1024) to the biased exponent field */
+      scaled = ux + 0x4000000000000000;
+    }
+  else
+    {
+      /* Denormal input: give it the exponent of 4.0, then remove the
+         signed 4.0 contributed by the implicit leading bit */
+      PUT_BITS_DP64(ux | 0x4010000000000000, tmp);
+      tmp = (ux & SIGNBIT_DP64) ? tmp + 4.0 : tmp - 4.0;
+      GET_BITS_DP64(tmp, scaled);
+    }
+
+  *ur = scaled;
+  return;
+}
+
+#endif /* USE_SCALEUPDOUBLE1024 */
+
+
+#if defined(USE_SCALEDOWNDOUBLE)
+/* Divides by 2**k the double whose bit pattern is ux and stores the
+   resulting bit pattern in *ur.  No check is made that the value can
+   actually be scaled down by that amount. */
+static inline void scaleDownDouble(unsigned long long ux, int k,
+                                   unsigned long long *ur)
+{
+  const unsigned long long sign = ux & SIGNBIT_DP64;
+  const unsigned long long mag = ux & ~SIGNBIT_DP64;
+  const unsigned long long frac = mag & ~EXPBITS_DP64;
+  const int newexp = (int)((mag & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k;
+  unsigned long long res;
+  if (newexp > 0)
+    {
+      /* Still normal: just store the reduced biased exponent */
+      res = frac | ((unsigned long long)newexp << EXPSHIFTBITS_DP64);
+    }
+  else
+    {
+      /* Result is denormal: make the leading bit explicit and shift */
+      const int shift = 1 - newexp;
+      res = frac | 0x0010000000000000;
+      if (shift > MANTLENGTH_DP64 + 1)
+        res = 0;  /* all bits gone; avoids a shift count that wraps mod 64 */
+      else
+        {
+          res >>= shift - 1;             /* stop one bit short ... */
+          res = (res >> 1) + (res & 1);  /* ... and round on that bit */
+        }
+    }
+  *ur = res | sign;
+}
+
+#endif /* USE_SCALEDOWNDOUBLE */
+
+
+#if defined(USE_SCALEUPFLOAT128)
+/* Multiplies by 2**128 the float (normal or denormal) whose bit pattern
+   is ux and stores the resulting bit pattern in *ur.  No check is made
+   that the value can actually be scaled up by that amount. */
+static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur)
+{
+  unsigned int scaled;
+  float tmp;
+
+  if ((ux & EXPBITS_SP32) != 0)
+    {
+      /* Normal input: add 0x80 (= 128) to the biased exponent field */
+      scaled = ux + 0x40000000;
+    }
+  else
+    {
+      /* Denormal input: give it the exponent of 4.0, then remove the
+         signed 4.0 contributed by the implicit leading bit */
+      PUT_BITS_SP32(ux | 0x40800000, tmp);
+      tmp = (ux & SIGNBIT_SP32) ? tmp + 4.0F : tmp - 4.0F;
+      GET_BITS_SP32(tmp, scaled);
+    }
+
+  *ur = scaled;
+}
+#endif /* USE_SCALEUPFLOAT128 */
+
+
+#if defined(USE_SCALEDOWNFLOAT)
+/* Scales down a float whose bit pattern is given as ux by 2**k; the result
+   bits go to *ur. No check is made that the input is scalable that far. */
+static inline void scaleDownFloat(unsigned int ux, int k,
+                                  unsigned int *ur)
+{
+  unsigned int uy, uk, ax, xsign;
+  int n, shift;
+
+  xsign = ux & SIGNBIT_SP32;
+  ax = ux & ~SIGNBIT_SP32;
+  n = (int)((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; /* signed subtract */
+  if (n > 0)
+    {
+      uk = (unsigned int)n << EXPSHIFTBITS_SP32;
+      uy = (ax & ~EXPBITS_SP32) | uk;
+    }
+  else
+    {
+      uy = (ax & ~EXPBITS_SP32) | 0x00800000; /* make the leading bit explicit */
+      shift = (1 - n);
+      if (shift > MANTLENGTH_SP32 + 1)
+        /* Sigh. Shifting works mod 32 so be careful not to shift too much */
+        uy = 0;
+      else
+        {
+          /* Make sure we round the result */
+          uy >>= shift - 1;
+          uy = (uy >> 1) + (uy & 1);
+        }
+    }
+  *ur = uy | xsign;
+}
+#endif /* USE_SCALEDOWNFLOAT */
+
+
+#if defined(USE_SQRT_AMD_INLINE)
+static inline double sqrt_amd_inline(double x)
+{
+  /*
+     Computes the square root of x.  +-0, +infinity and NaN (as x + x)
+     are returned directly; other negative x (including -infinity) give
+     nan_with_flags(AMD_F_INVALID).  Otherwise three steps are used.
+
+     Step 1. Reduction.
+     The input argument is scaled to the interval [1, 4) by
+     computing
+               x = 2^e * y, where y in [1,4).
+     Furthermore y is decomposed as y = c + t where
+               c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
+
+     Step 2. Approximation.
+     An approximation q = sqrt(1 + (t/c)) - 1  is obtained
+     from a basic series expansion using precomputed values
+     stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl.
+
+     Step 3. Reconstruction.
+     The value of sqrt(x) is reconstructed via
+       sqrt(x) = 2^(e/2) * sqrt(y)
+               = 2^(e/2) * sqrt(c) * sqrt(y/c)
+               = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
+               = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
+    */
+
+  unsigned long long ux, ax, u;
+  double c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
+  int e, denorm = 0, index;
+
+/* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain
+   leading and trailing parts respectively of precomputed
+   values of sqrt(j/32), for j = 32, 33, ..., 128.
+   rt_jby32_lead_table_dbl contains the first 21 bits of precision,
+   and rt_jby32_trail_table_dbl contains a further 53 bits precision. */
+
+  static const double rt_jby32_lead_table_dbl[97] = {
+    1.00000000000000000000e+00,   /* 0x3ff0000000000000 */
+    1.01550388336181640625e+00,   /* 0x3ff03f8100000000 */
+    1.03077602386474609375e+00,   /* 0x3ff07e0f00000000 */
+    1.04582500457763671875e+00,   /* 0x3ff0bbb300000000 */
+    1.06065940856933593750e+00,   /* 0x3ff0f87600000000 */
+    1.07528972625732421875e+00,   /* 0x3ff1346300000000 */
+    1.08972454071044921875e+00,   /* 0x3ff16f8300000000 */
+    1.10396957397460937500e+00,   /* 0x3ff1a9dc00000000 */
+    1.11803340911865234375e+00,   /* 0x3ff1e37700000000 */
+    1.13192272186279296875e+00,   /* 0x3ff21c5b00000000 */
+    1.14564323425292968750e+00,   /* 0x3ff2548e00000000 */
+    1.15920162200927734375e+00,   /* 0x3ff28c1700000000 */
+    1.17260360717773437500e+00,   /* 0x3ff2c2fc00000000 */
+    1.18585395812988281250e+00,   /* 0x3ff2f94200000000 */
+    1.19895744323730468750e+00,   /* 0x3ff32eee00000000 */
+    1.21191978454589843750e+00,   /* 0x3ff3640600000000 */
+    1.22474479675292968750e+00,   /* 0x3ff3988e00000000 */
+    1.23743629455566406250e+00,   /* 0x3ff3cc8a00000000 */
+    1.25000000000000000000e+00,   /* 0x3ff4000000000000 */
+    1.26243782043457031250e+00,   /* 0x3ff432f200000000 */
+    1.27475452423095703125e+00,   /* 0x3ff4656500000000 */
+    1.28695297241210937500e+00,   /* 0x3ff4975c00000000 */
+    1.29903793334960937500e+00,   /* 0x3ff4c8dc00000000 */
+    1.31101036071777343750e+00,   /* 0x3ff4f9e600000000 */
+    1.32287502288818359375e+00,   /* 0x3ff52a7f00000000 */
+    1.33463478088378906250e+00,   /* 0x3ff55aaa00000000 */
+    1.34629058837890625000e+00,   /* 0x3ff58a6800000000 */
+    1.35784721374511718750e+00,   /* 0x3ff5b9be00000000 */
+    1.36930561065673828125e+00,   /* 0x3ff5e8ad00000000 */
+    1.38066959381103515625e+00,   /* 0x3ff6173900000000 */
+    1.39194107055664062500e+00,   /* 0x3ff6456400000000 */
+    1.40312099456787109375e+00,   /* 0x3ff6732f00000000 */
+    1.41421318054199218750e+00,   /* 0x3ff6a09e00000000 */
+    1.42521858215332031250e+00,   /* 0x3ff6cdb200000000 */
+    1.43614006042480468750e+00,   /* 0x3ff6fa6e00000000 */
+    1.44697952270507812500e+00,   /* 0x3ff726d400000000 */
+    1.45773792266845703125e+00,   /* 0x3ff752e500000000 */
+    1.46841716766357421875e+00,   /* 0x3ff77ea300000000 */
+    1.47901916503906250000e+00,   /* 0x3ff7aa1000000000 */
+    1.48954677581787109375e+00,   /* 0x3ff7d52f00000000 */
+    1.50000000000000000000e+00,   /* 0x3ff8000000000000 */
+    1.51038074493408203125e+00,   /* 0x3ff82a8500000000 */
+    1.52068996429443359375e+00,   /* 0x3ff854bf00000000 */
+    1.53093051910400390625e+00,   /* 0x3ff87eb100000000 */
+    1.54110336303710937500e+00,   /* 0x3ff8a85c00000000 */
+    1.55120849609375000000e+00,   /* 0x3ff8d1c000000000 */
+    1.56124877929687500000e+00,   /* 0x3ff8fae000000000 */
+    1.57122516632080078125e+00,   /* 0x3ff923bd00000000 */
+    1.58113861083984375000e+00,   /* 0x3ff94c5800000000 */
+    1.59099006652832031250e+00,   /* 0x3ff974b200000000 */
+    1.60078048706054687500e+00,   /* 0x3ff99ccc00000000 */
+    1.61051177978515625000e+00,   /* 0x3ff9c4a800000000 */
+    1.62018489837646484375e+00,   /* 0x3ff9ec4700000000 */
+    1.62979984283447265625e+00,   /* 0x3ffa13a900000000 */
+    1.63935947418212890625e+00,   /* 0x3ffa3ad100000000 */
+    1.64886283874511718750e+00,   /* 0x3ffa61be00000000 */
+    1.65831184387207031250e+00,   /* 0x3ffa887200000000 */
+    1.66770744323730468750e+00,   /* 0x3ffaaeee00000000 */
+    1.67705059051513671875e+00,   /* 0x3ffad53300000000 */
+    1.68634128570556640625e+00,   /* 0x3ffafb4100000000 */
+    1.69558238983154296875e+00,   /* 0x3ffb211b00000000 */
+    1.70477199554443359375e+00,   /* 0x3ffb46bf00000000 */
+    1.71391296386718750000e+00,   /* 0x3ffb6c3000000000 */
+    1.72300529479980468750e+00,   /* 0x3ffb916e00000000 */
+    1.73204994201660156250e+00,   /* 0x3ffbb67a00000000 */
+    1.74104785919189453125e+00,   /* 0x3ffbdb5500000000 */
+    1.75000000000000000000e+00,   /* 0x3ffc000000000000 */
+    1.75890541076660156250e+00,   /* 0x3ffc247a00000000 */
+    1.76776695251464843750e+00,   /* 0x3ffc48c600000000 */
+    1.77658367156982421875e+00,   /* 0x3ffc6ce300000000 */
+    1.78535652160644531250e+00,   /* 0x3ffc90d200000000 */
+    1.79408740997314453125e+00,   /* 0x3ffcb49500000000 */
+    1.80277538299560546875e+00,   /* 0x3ffcd82b00000000 */
+    1.81142139434814453125e+00,   /* 0x3ffcfb9500000000 */
+    1.82002735137939453125e+00,   /* 0x3ffd1ed500000000 */
+    1.82859230041503906250e+00,   /* 0x3ffd41ea00000000 */
+    1.83711719512939453125e+00,   /* 0x3ffd64d500000000 */
+    1.84560203552246093750e+00,   /* 0x3ffd879600000000 */
+    1.85404872894287109375e+00,   /* 0x3ffdaa2f00000000 */
+    1.86245727539062500000e+00,   /* 0x3ffdcca000000000 */
+    1.87082862854003906250e+00,   /* 0x3ffdeeea00000000 */
+    1.87916183471679687500e+00,   /* 0x3ffe110c00000000 */
+    1.88745784759521484375e+00,   /* 0x3ffe330700000000 */
+    1.89571857452392578125e+00,   /* 0x3ffe54dd00000000 */
+    1.90394306182861328125e+00,   /* 0x3ffe768d00000000 */
+    1.91213226318359375000e+00,   /* 0x3ffe981800000000 */
+    1.92028617858886718750e+00,   /* 0x3ffeb97e00000000 */
+    1.92840576171875000000e+00,   /* 0x3ffedac000000000 */
+    1.93649101257324218750e+00,   /* 0x3ffefbde00000000 */
+    1.94454288482666015625e+00,   /* 0x3fff1cd900000000 */
+    1.95256233215332031250e+00,   /* 0x3fff3db200000000 */
+    1.96054744720458984375e+00,   /* 0x3fff5e6700000000 */
+    1.96850109100341796875e+00,   /* 0x3fff7efb00000000 */
+    1.97642326354980468750e+00,   /* 0x3fff9f6e00000000 */
+    1.98431301116943359375e+00,   /* 0x3fffbfbf00000000 */
+    1.99217128753662109375e+00,   /* 0x3fffdfef00000000 */
+    2.00000000000000000000e+00};  /* 0x4000000000000000 */
+
+  static const double rt_jby32_trail_table_dbl[97] = {
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    9.17217678638807524014e-07,   /* 0x3eaec6d70177881c */
+    3.82539669043705364790e-07,   /* 0x3e99abfb41bd6b24 */
+    2.85899577162227138140e-08,   /* 0x3e5eb2bf6bab55a2 */
+    7.63210485349101216659e-07,   /* 0x3ea99bed9b2d8d0c */
+    9.32123004127716212874e-07,   /* 0x3eaf46e029c1b296 */
+    1.95174719169309219157e-07,   /* 0x3e8a3226fc42f30c */
+    5.34316371481845492427e-07,   /* 0x3ea1edbe20701d73 */
+    5.79631242504454563052e-07,   /* 0x3ea372fe94f82be7 */
+    4.20404384109571705948e-07,   /* 0x3e9c367e08e7bb06 */
+    6.89486030314147010716e-07,   /* 0x3ea722a3d0a66608 */
+    6.89927685625314560328e-07,   /* 0x3ea7266f067ca1d6 */
+    3.32778123013641425828e-07,   /* 0x3e965515a9b34850 */
+    1.64433259436999584387e-07,   /* 0x3e8611e23ef6c1bd */
+    4.37590875197899335723e-07,   /* 0x3e9d5dc1059ed8e7 */
+    1.79808183816018617413e-07,   /* 0x3e88222982d0e4f4 */
+    7.46386593615986477624e-08,   /* 0x3e7409212e7d0322 */
+    5.72520794105201454728e-07,   /* 0x3ea335ea8a5fcf39 */
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    2.96860689431670420344e-07,   /* 0x3e93ec071e938bfe */
+    3.54167239176257065345e-07,   /* 0x3e97c48bfd9862c6 */
+    7.95211265664474710063e-07,   /* 0x3eaaaed010f74671 */
+    1.72327048595145565621e-07,   /* 0x3e87211cbfeb62e0 */
+    6.99494915996239297020e-07,   /* 0x3ea7789d9660e72d */
+    6.32644111701500844315e-07,   /* 0x3ea53a5f1d36f1cf */
+    6.20124838851440463844e-10,   /* 0x3e054eacff2057dc */
+    6.13404719757812629969e-07,   /* 0x3ea4951b3e6a83cc */
+    3.47654909777986407387e-07,   /* 0x3e9754aa76884c66 */
+    7.83106177002392475763e-07,   /* 0x3eaa46d4b1de1074 */
+    5.33337372440526357008e-07,   /* 0x3ea1e55548f92635 */
+    2.01508648555298681765e-08,   /* 0x3e55a3070dd17788 */
+    5.25472356925843939587e-07,   /* 0x3ea1a1c5eedb0801 */
+    3.81831102861301692797e-07,   /* 0x3e999fcef32422cc */
+    6.99220602161420018738e-07,   /* 0x3ea776425d6b0199 */
+    6.01209702477462624811e-07,   /* 0x3ea42c5a1e0191a2 */
+    9.01437000591944740554e-08,   /* 0x3e7832a0bdff1327 */
+    5.10428680864685379950e-08,   /* 0x3e6b674743636676 */
+    3.47895267104621031421e-07,   /* 0x3e9758cb90d2f714 */
+    7.80735841510641848628e-07,   /* 0x3eaa3278459cde25 */
+    1.35158752025506517690e-07,   /* 0x3e822404f4a103ee */
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    1.76523947728535489812e-09,   /* 0x3e1e539af6892ac5 */
+    6.68280121328499932183e-07,   /* 0x3ea66c7b872c9cd0 */
+    5.70135482405123276616e-07,   /* 0x3ea3216d2f43887d */
+    1.37705134737562525897e-07,   /* 0x3e827b832cbedc0e */
+    7.09655107074516613672e-07,   /* 0x3ea7cfe41579091d */
+    7.20302724551461693011e-07,   /* 0x3ea82b5a713c490a */
+    4.69926266058212796694e-07,   /* 0x3e9f8945932d872e */
+    2.19244345915999437026e-07,   /* 0x3e8d6d2da9490251 */
+    1.91141411617401877927e-07,   /* 0x3e89a791a3114e4a */
+    5.72297665296622053774e-07,   /* 0x3ea333ffe005988d */
+    5.61055484436830560103e-07,   /* 0x3ea2d36e0ed49ab1 */
+    2.76225500213991506100e-07,   /* 0x3e92898498f55f9e */
+    7.58466189522395692908e-07,   /* 0x3ea9732cca1032a3 */
+    1.56893371256836029827e-07,   /* 0x3e850ed0b02a22d2 */
+    4.06038997708867066507e-07,   /* 0x3e9b3fb265b1e40a */
+    5.51305629612057435809e-07,   /* 0x3ea27fade682d1de */
+    5.64778487026561123207e-07,   /* 0x3ea2f36906f707ba */
+    3.92609705553556897517e-07,   /* 0x3e9a58fbbee883b6 */
+    9.09698438776943827802e-07,   /* 0x3eae864005bca6d7 */
+    1.05949774066016139743e-07,   /* 0x3e7c70d02300f263 */
+    7.16578798392844784244e-07,   /* 0x3ea80b5d712d8e3e */
+    6.86233073531233972561e-07,   /* 0x3ea706b27cc7d390 */
+    7.99211473033494452908e-07,   /* 0x3eaad12c9d849a97 */
+    8.65552275731027456121e-07,   /* 0x3ead0b09954e764b */
+    6.75456120386058448618e-07,   /* 0x3ea6aa1fb7826cbd */
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    4.99167184520462138743e-07,   /* 0x3ea0bfd03f46763c */
+    4.51720373502110930296e-10,   /* 0x3dff0abfb4adfb9e */
+    1.28874162718371367439e-07,   /* 0x3e814c151f991b2e */
+    5.85529267186999798656e-07,   /* 0x3ea3a5a879b09292 */
+    1.01827770937125531924e-07,   /* 0x3e7b558d173f9796 */
+    2.54736389177809626508e-07,   /* 0x3e9118567cd83fb8 */
+    6.98925535290464831294e-07,   /* 0x3ea773b981896751 */
+    1.20940735036524314513e-07,   /* 0x3e803b7df49f48a8 */
+    5.43759351196479689657e-08,   /* 0x3e6d315f22491900 */
+    1.11957989042397958409e-07,   /* 0x3e7e0db1c5bb84b2 */
+    8.47006714134442661218e-07,   /* 0x3eac6bbb7644ff76 */
+    8.92831044643427836228e-07,   /* 0x3eadf55c3afec01f */
+    7.77828292464916501663e-07,   /* 0x3eaa197e81034da3 */
+    6.48469316302918797451e-08,   /* 0x3e71683f4920555d */
+    2.12579816658859849140e-07,   /* 0x3e8c882fd78bb0b0 */
+    7.61222472580559138435e-07,   /* 0x3ea98ad9eb7b83ec */
+    2.86488961857314189607e-07,   /* 0x3e9339d7c7777273 */
+    2.14637363790165363515e-07,   /* 0x3e8ccee237cae6fe */
+    5.44137005612605847831e-08,   /* 0x3e6d368fe324a146 */
+    2.58378284856442408413e-07,   /* 0x3e9156e7b6d99b45 */
+    3.15848939061134843091e-07,   /* 0x3e95323e5310b5c1 */
+    6.60530466255089632309e-07,   /* 0x3ea629e9db362f5d */
+    7.63436345535852301127e-07,   /* 0x3ea99dde4728d7ec */
+    8.68233432860324345268e-08,   /* 0x3e774e746878544d */
+    9.45465175398023087082e-07,   /* 0x3eafb97be873a87d */
+    8.77499534786171267246e-07,   /* 0x3ead71a9e23c2f63 */
+    2.74055432394999316135e-07,   /* 0x3e92643c89cda173 */
+    4.72129009349126213532e-07,   /* 0x3e9faf1d57a4d56c */
+    8.93777032327078947306e-07,   /* 0x3eadfd7c7ab7b282 */
+    0.00000000000000000000e+00};  /* 0x0000000000000000 */
+
+
+  /* Handle special arguments first */
+
+  GET_BITS_DP64(x, ux);
+  ax = ux & (~SIGNBIT_DP64);
+
+  if(ax >= 0x7ff0000000000000)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_DP64)
+        /* x is NaN */
+        return x + x; /* Raise invalid if it is a signalling NaN */
+      else if (ux & SIGNBIT_DP64)
+        /* x is negative infinity */
+        return nan_with_flags(AMD_F_INVALID);
+      else
+        /* x is positive infinity */
+        return x;
+    }
+  else if (ux & SIGNBIT_DP64)
+    {
+      /* x is negative. */
+      if (ux == SIGNBIT_DP64)
+        /* Handle negative zero first */
+        return x;
+      else
+        return nan_with_flags(AMD_F_INVALID);
+    }
+  else if (ux <= 0x000fffffffffffff)
+    {
+      /* x is denormalised or zero */
+      if (ux == 0)
+        /* x is zero */
+        return x;
+      else
+        {
+          /* x is denormalised; scale it up */
+          /* Normalize x by increasing the exponent by 60
+             and subtracting a correction to account for the implicit
+             bit. This replaces a slow denormalized
+             multiplication by a fast normal subtraction. */
+          static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
+          denorm = 1;
+          GET_BITS_DP64(x, ux);
+          PUT_BITS_DP64(ux | 0x03d0000000000000, x);
+          x -= corr;
+          GET_BITS_DP64(x, ux);
+        }
+    }
+
+  /* Main algorithm */
+
+  /*
+     Find y and e such that x = 2^e * y, where y in [1,4).
+     This is done using an in-lined variant of splitDouble,
+     which also ensures that e is even.
+   */
+  y = x;
+  ux &= EXPBITS_DP64;
+  ux >>= EXPSHIFTBITS_DP64;
+  if (ux & 1)
+    {
+      GET_BITS_DP64(y, u);
+      u &= (SIGNBIT_DP64 | MANTBITS_DP64);
+      u |= ONEEXPBITS_DP64;
+      PUT_BITS_DP64(u, y);
+      e = (int)ux - EXPBIAS_DP64; /* ux is only the exponent field here */
+    }
+  else
+    {
+      GET_BITS_DP64(y, u);
+      u &= (SIGNBIT_DP64 | MANTBITS_DP64);
+      u |= TWOEXPBITS_DP64;
+      PUT_BITS_DP64(u, y);
+      e = (int)ux - EXPBIAS_DP64 - 1;
+    }
+
+
+  /* Find the index of the sub-interval of [1,4) in which y lies. */
+
+  index = (int)(32.0*y+0.5);
+
+  /* Look up the table values and compute c and r = t/c, with t = y - c */
+
+  rtc_lead = rt_jby32_lead_table_dbl[index-32];
+  rtc_trail = rt_jby32_trail_table_dbl[index-32];
+  c = 0.03125*index;
+  r = (y - c)/c;
+
+  /*
+    Find q = sqrt(1+r) - 1.
+    From one step of Newton on (q+1)^2 = 1+r
+  */
+
+  p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01));
+  twop = p + p;
+  q = p - (p*p + (twop - r))/(twop + 2.0);
+
+  /* Reconstruction */
+
+  rtc = rtc_lead + rtc_trail;
+  e /= 2; /* e is even, so exact; avoids right-shifting a negative int */
+  z = rtc_lead + (rtc*q+rtc_trail);
+
+  if (denorm)
+    {
+      /* Scale by 2**(e-30) */
+      PUT_BITS_DP64(((long long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
+      z *= r;
+    }
+  else
+    {
+      /* Scale by 2**e */
+      PUT_BITS_DP64(((long long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r);
+      z *= r;
+    }
+
+  return z;
+
+}
+#endif /* USE_SQRT_AMD_INLINE */
+
+#if defined(USE_SQRTF_AMD_INLINE)
+
+static inline float sqrtf_amd_inline(float x)
+{
+  /*
+     Computes the square root of x.  +-0, +infinity and NaN (as x + x)
+     are returned directly; other negative x (including -infinity) give
+     nanf_with_flags(AMD_F_INVALID).  Otherwise three steps are used.
+
+     Step 1. Reduction.
+     The input argument is scaled to the interval [1, 4) by
+     computing
+               x = 2^e * y, where y in [1,4).
+     Furthermore y is decomposed as y = c + t where
+               c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64.
+
+     Step 2. Approximation.
+     An approximation q = sqrt(1 + (t/c)) - 1  is obtained
+     from a basic series expansion using precomputed values
+     stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float.
+
+     Step 3. Reconstruction.
+     The value of sqrt(x) is reconstructed via
+       sqrt(x) = 2^(e/2) * sqrt(y)
+               = 2^(e/2) * sqrt(c) * sqrt(y/c)
+               = 2^(e/2) * sqrt(c) * sqrt(1 + t/c)
+               = 2^(e/2) * [ sqrt(c) + sqrt(c)*q ]
+    */
+
+  unsigned int ux, ax, u;
+  float c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail;
+  int e, denorm = 0, index;
+
+/* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain
+   leading and trailing parts respectively of precomputed
+   values of sqrt(j/32), for j = 32, 33, ..., 128.
+   rt_jby32_lead_table_float contains the first 13 bits of precision,
+   and rt_jby32_trail_table_float contains a further 24 bits precision. */
+
+static const float rt_jby32_lead_table_float[97] = {
+    1.00000000000000000000e+00F,   /* 0x3f800000 */
+    1.01538085937500000000e+00F,   /* 0x3f81f800 */
+    1.03076171875000000000e+00F,   /* 0x3f83f000 */
+    1.04565429687500000000e+00F,   /* 0x3f85d800 */
+    1.06054687500000000000e+00F,   /* 0x3f87c000 */
+    1.07519531250000000000e+00F,   /* 0x3f89a000 */
+    1.08959960937500000000e+00F,   /* 0x3f8b7800 */
+    1.10375976562500000000e+00F,   /* 0x3f8d4800 */
+    1.11791992187500000000e+00F,   /* 0x3f8f1800 */
+    1.13183593750000000000e+00F,   /* 0x3f90e000 */
+    1.14550781250000000000e+00F,   /* 0x3f92a000 */
+    1.15917968750000000000e+00F,   /* 0x3f946000 */
+    1.17236328125000000000e+00F,   /* 0x3f961000 */
+    1.18579101562500000000e+00F,   /* 0x3f97c800 */
+    1.19873046875000000000e+00F,   /* 0x3f997000 */
+    1.21191406250000000000e+00F,   /* 0x3f9b2000 */
+    1.22460937500000000000e+00F,   /* 0x3f9cc000 */
+    1.23730468750000000000e+00F,   /* 0x3f9e6000 */
+    1.25000000000000000000e+00F,   /* 0x3fa00000 */
+    1.26220703125000000000e+00F,   /* 0x3fa19000 */
+    1.27465820312500000000e+00F,   /* 0x3fa32800 */
+    1.28686523437500000000e+00F,   /* 0x3fa4b800 */
+    1.29882812500000000000e+00F,   /* 0x3fa64000 */
+    1.31079101562500000000e+00F,   /* 0x3fa7c800 */
+    1.32275390625000000000e+00F,   /* 0x3fa95000 */
+    1.33447265625000000000e+00F,   /* 0x3faad000 */
+    1.34619140625000000000e+00F,   /* 0x3fac5000 */
+    1.35766601562500000000e+00F,   /* 0x3fadc800 */
+    1.36914062500000000000e+00F,   /* 0x3faf4000 */
+    1.38061523437500000000e+00F,   /* 0x3fb0b800 */
+    1.39184570312500000000e+00F,   /* 0x3fb22800 */
+    1.40307617187500000000e+00F,   /* 0x3fb39800 */
+    1.41406250000000000000e+00F,   /* 0x3fb50000 */
+    1.42504882812500000000e+00F,   /* 0x3fb66800 */
+    1.43603515625000000000e+00F,   /* 0x3fb7d000 */
+    1.44677734375000000000e+00F,   /* 0x3fb93000 */
+    1.45751953125000000000e+00F,   /* 0x3fba9000 */
+    1.46826171875000000000e+00F,   /* 0x3fbbf000 */
+    1.47900390625000000000e+00F,   /* 0x3fbd5000 */
+    1.48950195312500000000e+00F,   /* 0x3fbea800 */
+    1.50000000000000000000e+00F,   /* 0x3fc00000 */
+    1.51025390625000000000e+00F,   /* 0x3fc15000 */
+    1.52050781250000000000e+00F,   /* 0x3fc2a000 */
+    1.53076171875000000000e+00F,   /* 0x3fc3f000 */
+    1.54101562500000000000e+00F,   /* 0x3fc54000 */
+    1.55102539062500000000e+00F,   /* 0x3fc68800 */
+    1.56103515625000000000e+00F,   /* 0x3fc7d000 */
+    1.57104492187500000000e+00F,   /* 0x3fc91800 */
+    1.58105468750000000000e+00F,   /* 0x3fca6000 */
+    1.59082031250000000000e+00F,   /* 0x3fcba000 */
+    1.60058593750000000000e+00F,   /* 0x3fcce000 */
+    1.61035156250000000000e+00F,   /* 0x3fce2000 */
+    1.62011718750000000000e+00F,   /* 0x3fcf6000 */
+    1.62963867187500000000e+00F,   /* 0x3fd09800 */
+    1.63916015625000000000e+00F,   /* 0x3fd1d000 */
+    1.64868164062500000000e+00F,   /* 0x3fd30800 */
+    1.65820312500000000000e+00F,   /* 0x3fd44000 */
+    1.66748046875000000000e+00F,   /* 0x3fd57000 */
+    1.67700195312500000000e+00F,   /* 0x3fd6a800 */
+    1.68627929687500000000e+00F,   /* 0x3fd7d800 */
+    1.69555664062500000000e+00F,   /* 0x3fd90800 */
+    1.70458984375000000000e+00F,   /* 0x3fda3000 */
+    1.71386718750000000000e+00F,   /* 0x3fdb6000 */
+    1.72290039062500000000e+00F,   /* 0x3fdc8800 */
+    1.73193359375000000000e+00F,   /* 0x3fddb000 */
+    1.74096679687500000000e+00F,   /* 0x3fded800 */
+    1.75000000000000000000e+00F,   /* 0x3fe00000 */
+    1.75878906250000000000e+00F,   /* 0x3fe12000 */
+    1.76757812500000000000e+00F,   /* 0x3fe24000 */
+    1.77636718750000000000e+00F,   /* 0x3fe36000 */
+    1.78515625000000000000e+00F,   /* 0x3fe48000 */
+    1.79394531250000000000e+00F,   /* 0x3fe5a000 */
+    1.80273437500000000000e+00F,   /* 0x3fe6c000 */
+    1.81127929687500000000e+00F,   /* 0x3fe7d800 */
+    1.81982421875000000000e+00F,   /* 0x3fe8f000 */
+    1.82836914062500000000e+00F,   /* 0x3fea0800 */
+    1.83691406250000000000e+00F,   /* 0x3feb2000 */
+    1.84545898437500000000e+00F,   /* 0x3fec3800 */
+    1.85400390625000000000e+00F,   /* 0x3fed5000 */
+    1.86230468750000000000e+00F,   /* 0x3fee6000 */
+    1.87060546875000000000e+00F,   /* 0x3fef7000 */
+    1.87915039062500000000e+00F,   /* 0x3ff08800 */
+    1.88745117187500000000e+00F,   /* 0x3ff19800 */
+    1.89550781250000000000e+00F,   /* 0x3ff2a000 */
+    1.90380859375000000000e+00F,   /* 0x3ff3b000 */
+    1.91210937500000000000e+00F,   /* 0x3ff4c000 */
+    1.92016601562500000000e+00F,   /* 0x3ff5c800 */
+    1.92822265625000000000e+00F,   /* 0x3ff6d000 */
+    1.93627929687500000000e+00F,   /* 0x3ff7d800 */
+    1.94433593750000000000e+00F,   /* 0x3ff8e000 */
+    1.95239257812500000000e+00F,   /* 0x3ff9e800 */
+    1.96044921875000000000e+00F,   /* 0x3ffaf000 */
+    1.96826171875000000000e+00F,   /* 0x3ffbf000 */
+    1.97631835937500000000e+00F,   /* 0x3ffcf800 */
+    1.98413085937500000000e+00F,   /* 0x3ffdf800 */
+    1.99194335937500000000e+00F,   /* 0x3ffef800 */
+    2.00000000000000000000e+00F};  /* 0x40000000 */
+
+static const float rt_jby32_trail_table_float[97] = {
+    0.00000000000000000000e+00F,   /* 0x00000000 */
+    1.23941208585165441036e-04F,   /* 0x3901f637 */
+    1.46876545841223560274e-05F,   /* 0x37766aff */
+    1.70736297150142490864e-04F,   /* 0x393307ad */
+    1.13296780909877270460e-04F,   /* 0x38ed99bf */
+    9.53458802541717886925e-05F,   /* 0x38c7f46e */
+    1.25126505736261606216e-04F,   /* 0x39033464 */
+    2.10342666832730174065e-04F,   /* 0x395c8f6e */
+    1.14066875539720058441e-04F,   /* 0x38ef3730 */
+    8.72047676239162683487e-05F,   /* 0x38b6e1b4 */
+    1.36111237225122749805e-04F,   /* 0x390eb915 */
+    2.26244374061934649944e-05F,   /* 0x37bdc99c */
+    2.40658700931817293167e-04F,   /* 0x397c5954 */
+    6.31069415248930454254e-05F,   /* 0x38845848 */
+    2.27412077947519719601e-04F,   /* 0x396e7577 */
+    5.90185391047270968556e-06F,   /* 0x36c6088a */
+    1.35496389702893793583e-04F,   /* 0x390e1409 */
+    1.32179571664892137051e-04F,   /* 0x390a99af */
+    0.00000000000000000000e+00F,   /* 0x00000000 */
+    2.31086043640971183777e-04F,   /* 0x39724fb0 */
+    9.66752704698592424393e-05F,   /* 0x38cabe24 */
+    8.85332483449019491673e-05F,   /* 0x38b9aaed */
+    2.09980673389509320259e-04F,   /* 0x395c2e42 */
+    2.20044588786549866199e-04F,   /* 0x3966bbc5 */
+    1.21749282698146998882e-04F,   /* 0x38ff53a6 */
+    1.62125259521417319775e-04F,   /* 0x392a002b */
+    9.97955357888713479042e-05F,   /* 0x38d14952 */
+    1.81545779923908412457e-04F,   /* 0x393e5d53 */
+    1.65768768056295812130e-04F,   /* 0x392dd237 */
+    5.48927710042335093021e-05F,   /* 0x38663caa */
+    9.53875860432162880898e-05F,   /* 0x38c80ad2 */
+    4.53481625299900770187e-05F,   /* 0x383e3438 */
+    1.51062369695864617825e-04F,   /* 0x391e667f */
+    1.70453247847035527229e-04F,   /* 0x3932bbb2 */
+    1.05505387182347476482e-04F,   /* 0x38dd42c6 */
+    2.02269104192964732647e-04F,   /* 0x39541833 */
+    2.18442466575652360916e-04F,   /* 0x39650db4 */
+    1.55796806211583316326e-04F,   /* 0x39235d63 */
+    1.60395247803535312414e-05F,   /* 0x37868c9e */
+    4.49578510597348213196e-05F,   /* 0x383c9120 */
+    0.00000000000000000000e+00F,   /* 0x00000000 */
+    1.26840444863773882389e-04F,   /* 0x39050079 */
+    1.82820076588541269302e-04F,   /* 0x393fb364 */
+    1.69370483490638434887e-04F,   /* 0x3931990b */
+    8.78757418831810355186e-05F,   /* 0x38b849ee */
+    1.83815121999941766262e-04F,   /* 0x3940be7f */
+    2.14343352126888930798e-04F,   /* 0x3960c15b */
+    1.80714370799250900745e-04F,   /* 0x393d7e25 */
+    8.41425862745381891727e-05F,   /* 0x38b075b5 */
+    1.69945167726837098598e-04F,   /* 0x3932334f */
+    1.95121858268976211548e-04F,   /* 0x394c99a0 */
+    1.60778334247879683971e-04F,   /* 0x3928969b */
+    6.79871009197086095810e-05F,   /* 0x388e944c */
+    1.61929419846273958683e-04F,   /* 0x3929cb99 */
+    1.99474830878898501396e-04F,   /* 0x39512a1e */
+    1.81604162207804620266e-04F,   /* 0x393e6cff */
+    1.09270178654696792364e-04F,   /* 0x38e527fb */
+    2.27539261686615645885e-04F,   /* 0x396e979b */
+    4.90300008095800876617e-05F,   /* 0x384da590 */
+    6.28985289949923753738e-05F,   /* 0x3883e864 */
+    2.58551553997676819563e-05F,   /* 0x37d8e386 */
+    1.82868374395184218884e-04F,   /* 0x393fc05b */
+    4.64625991298817098141e-05F,   /* 0x3842e0d6 */
+    1.05703387816902250051e-04F,   /* 0x38ddad13 */
+    1.17213814519345760345e-04F,   /* 0x38f5d0b0 */
+    8.17377731436863541603e-05F,   /* 0x38ab6aa2 */
+    0.00000000000000000000e+00F,   /* 0x00000000 */
+    1.16847433673683553934e-04F,   /* 0x38f50bfd */
+    1.88827965757809579372e-04F,   /* 0x3946001f */
+    2.16612941585481166840e-04F,   /* 0x39632298 */
+    2.00857131858356297016e-04F,   /* 0x39529d2d */
+    1.42199307447299361229e-04F,   /* 0x39151b56 */
+    4.12627305195201188326e-05F,   /* 0x382d1185 */
+    1.42796401632949709892e-04F,   /* 0x3915bb9e */
+    2.03253570361994206905e-04F,   /* 0x39552077 */
+    2.23214170546270906925e-04F,   /* 0x396a0e99 */
+    2.03244591830298304558e-04F,   /* 0x39551e0e */
+    1.43898156238719820976e-04F,   /* 0x3916e35e */
+    4.57155256299301981926e-05F,   /* 0x383fbeac */
+    1.53365719597786664963e-04F,   /* 0x3920d0cc */
+    2.23224633373320102692e-04F,   /* 0x396a1168 */
+    1.16566716314991936088e-05F,   /* 0x37439106 */
+    7.43694272387074306607e-06F,   /* 0x36f98ada */
+    2.11048507480882108212e-04F,   /* 0x395d4ce7 */
+    1.34682719362899661064e-04F,   /* 0x390d399e */
+    2.29425968427676707506e-05F,   /* 0x37c074da */
+    1.20421340398024767637e-04F,   /* 0x38fc8ab7 */
+    1.83421318070031702518e-04F,   /* 0x394054c9 */
+    2.12376224226318299770e-04F,   /* 0x395eb14f */
+    2.07710763788782060146e-04F,   /* 0x3959ccef */
+    1.69840845046564936638e-04F,   /* 0x3932174e */
+    9.91739216260612010956e-05F,   /* 0x38cffb98 */
+    2.40249748458154499531e-04F,   /* 0x397beb8d */
+    1.05178231024183332920e-04F,   /* 0x38dc9322 */
+    1.82623916771262884140e-04F,   /* 0x393f7ebc */
+    2.28821940254420042038e-04F,   /* 0x396fefec */
+    0.00000000000000000000e+00F};  /* 0x00000000 */
+
+
+/* Handle special arguments first */
+
+  GET_BITS_SP32(x, ux);
+  ax = ux & (~SIGNBIT_SP32);
+
+  if(ax >= 0x7f800000)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_SP32)
+        /* x is NaN */
+        return x + x; /* Raise invalid if it is a signalling NaN */
+      else if (ux & SIGNBIT_SP32)
+        return nanf_with_flags(AMD_F_INVALID); /* x is negative infinity */
+      else
+        /* x is positive infinity */
+        return x;
+    }
+  else if (ux & SIGNBIT_SP32)
+    {
+      /* x is negative. */
+      if (x == 0.0F)
+        /* Handle negative zero first */
+        return x;
+      else
+        return nanf_with_flags(AMD_F_INVALID);
+    }
+  else if (ux <= 0x007fffff)
+    {
+      /* x is denormalised or zero */
+      if (ux == 0)
+        /* x is zero */
+        return x;
+      else
+        {
+          /* x is denormalised; scale it up */
+          /* Normalize x by increasing the exponent by 26
+             and subtracting a correction to account for the implicit
+             bit. This replaces a slow denormalized
+             multiplication by a fast normal subtraction. */
+          static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */
+          denorm = 1;
+          GET_BITS_SP32(x, ux);
+          PUT_BITS_SP32(ux | 0x0d800000, x);
+          x -= corr;
+          GET_BITS_SP32(x, ux);
+        }
+    }
+
+  /* Main algorithm */
+
+  /*
+     Find y and e such that x = 2^e * y, where y in [1,4).
+     This is done using an in-lined variant of splitFloat,
+     which also ensures that e is even.
+   */
+  y = x;
+  ux &= EXPBITS_SP32;
+  ux >>= EXPSHIFTBITS_SP32;
+  if (ux & 1)
+    {
+      GET_BITS_SP32(y, u);
+      u &= (SIGNBIT_SP32 | MANTBITS_SP32);
+      u |= ONEEXPBITS_SP32;
+      PUT_BITS_SP32(u, y);
+      e = (int)ux - EXPBIAS_SP32; /* ux is only the exponent field here */
+    }
+  else
+    {
+      GET_BITS_SP32(y, u);
+      u &= (SIGNBIT_SP32 | MANTBITS_SP32);
+      u |= TWOEXPBITS_SP32;
+      PUT_BITS_SP32(u, y);
+      e = (int)ux - EXPBIAS_SP32 - 1;
+    }
+
+  /* Find the index of the sub-interval of [1,4) in which y lies. */
+
+  index = (int)(32.0F*y+0.5);
+
+  /* Look up the table values and compute c and r = t/c, with t = y - c */
+
+  rtc_lead = rt_jby32_lead_table_float[index-32];
+  rtc_trail = rt_jby32_trail_table_float[index-32];
+  c = 0.03125F*index;
+  r = (y - c)/c;
+
+  /*
+  Find q = sqrt(1+r) - 1.
+  From one step of Newton on (q+1)^2 = 1+r
+  */
+
+  p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F));
+  twop = p + p;
+  q = p - (p*p + (twop - r))/(twop + 2.0);
+
+  /* Reconstruction */
+
+  rtc = rtc_lead + rtc_trail;
+  e /= 2; /* e is even, so exact; avoids right-shifting a negative int */
+  z = rtc_lead + (rtc*q+rtc_trail);
+
+  if (denorm)
+    {
+      /* Scale by 2**(e-13) */
+      PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
+      z *= r;
+    }
+  else
+    {
+      /* Scale by 2**e */
+      PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r);
+      z *= r;
+    }
+
+  return z;
+
+}
+#endif /* USE_SQRTF_AMD_INLINE */
+
+#ifdef USE_LOG_KERNEL_AMD
+static inline void log_kernel_amd64(double x, unsigned long long ux, int *xexp, double *r1, double *r2)
+{
+
+  int expadjust;
+  double r, z1, z2, correction, f, f1, f2, q, u, v, poly;
+  int index;
+
+  /*
+    Computes natural log(x). Algorithm based on:
+    Ping-Tak Peter Tang
+    "Table-driven implementation of the logarithm function in IEEE
+    floating-point arithmetic"
+    ACM Transactions on Mathematical Software (TOMS)
+    Volume 16, Issue 4 (December 1990)
+  */
+
+/* Arrays ln_lead_table and ln_tail_table contain
+   leading and trailing parts respectively of precomputed
+   values of natural log(1+i/64), for i = 0, 1, ..., 64.
+   ln_lead_table contains the first 24 bits of precision,
+   and ln_tail_table contains a further 53 bits precision. */
+
+  static const double ln_lead_table[65] = {
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    1.55041813850402832031e-02,   /* 0x3f8fc0a800000000 */
+    3.07716131210327148438e-02,   /* 0x3f9f829800000000 */
+    4.58095073699951171875e-02,   /* 0x3fa7745800000000 */
+    6.06245994567871093750e-02,   /* 0x3faf0a3000000000 */
+    7.52233862876892089844e-02,   /* 0x3fb341d700000000 */
+    8.96121263504028320312e-02,   /* 0x3fb6f0d200000000 */
+    1.03796780109405517578e-01,   /* 0x3fba926d00000000 */
+    1.17783010005950927734e-01,   /* 0x3fbe270700000000 */
+    1.31576299667358398438e-01,   /* 0x3fc0d77e00000000 */
+    1.45181953907012939453e-01,   /* 0x3fc2955280000000 */
+    1.58604979515075683594e-01,   /* 0x3fc44d2b00000000 */
+    1.71850204467773437500e-01,   /* 0x3fc5ff3000000000 */
+    1.84922337532043457031e-01,   /* 0x3fc7ab8900000000 */
+    1.97825729846954345703e-01,   /* 0x3fc9525a80000000 */
+    2.10564732551574707031e-01,   /* 0x3fcaf3c900000000 */
+    2.23143517971038818359e-01,   /* 0x3fcc8ff780000000 */
+    2.35566020011901855469e-01,   /* 0x3fce270700000000 */
+    2.47836112976074218750e-01,   /* 0x3fcfb91800000000 */
+    2.59957492351531982422e-01,   /* 0x3fd0a324c0000000 */
+    2.71933674812316894531e-01,   /* 0x3fd1675c80000000 */
+    2.83768117427825927734e-01,   /* 0x3fd22941c0000000 */
+    2.95464158058166503906e-01,   /* 0x3fd2e8e280000000 */
+    3.07025015354156494141e-01,   /* 0x3fd3a64c40000000 */
+    3.18453729152679443359e-01,   /* 0x3fd4618bc0000000 */
+    3.29753279685974121094e-01,   /* 0x3fd51aad80000000 */
+    3.40926527976989746094e-01,   /* 0x3fd5d1bd80000000 */
+    3.51976394653320312500e-01,   /* 0x3fd686c800000000 */
+    3.62905442714691162109e-01,   /* 0x3fd739d7c0000000 */
+    3.73716354370117187500e-01,   /* 0x3fd7eaf800000000 */
+    3.84411692619323730469e-01,   /* 0x3fd89a3380000000 */
+    3.94993782043457031250e-01,   /* 0x3fd9479400000000 */
+    4.05465066432952880859e-01,   /* 0x3fd9f323c0000000 */
+    4.15827870368957519531e-01,   /* 0x3fda9cec80000000 */
+    4.26084339618682861328e-01,   /* 0x3fdb44f740000000 */
+    4.36236739158630371094e-01,   /* 0x3fdbeb4d80000000 */
+    4.46287095546722412109e-01,   /* 0x3fdc8ff7c0000000 */
+    4.56237375736236572266e-01,   /* 0x3fdd32fe40000000 */
+    4.66089725494384765625e-01,   /* 0x3fddd46a00000000 */
+    4.75845873355865478516e-01,   /* 0x3fde744240000000 */
+    4.85507786273956298828e-01,   /* 0x3fdf128f40000000 */
+    4.95077252388000488281e-01,   /* 0x3fdfaf5880000000 */
+    5.04556000232696533203e-01,   /* 0x3fe02552a0000000 */
+    5.13945698738098144531e-01,   /* 0x3fe0723e40000000 */
+    5.23248136043548583984e-01,   /* 0x3fe0be72e0000000 */
+    5.32464742660522460938e-01,   /* 0x3fe109f380000000 */
+    5.41597247123718261719e-01,   /* 0x3fe154c3c0000000 */
+    5.50647079944610595703e-01,   /* 0x3fe19ee6a0000000 */
+    5.59615731239318847656e-01,   /* 0x3fe1e85f40000000 */
+    5.68504691123962402344e-01,   /* 0x3fe23130c0000000 */
+    5.77315330505371093750e-01,   /* 0x3fe2795e00000000 */
+    5.86049020290374755859e-01,   /* 0x3fe2c0e9e0000000 */
+    5.94707071781158447266e-01,   /* 0x3fe307d720000000 */
+    6.03290796279907226562e-01,   /* 0x3fe34e2880000000 */
+    6.11801505088806152344e-01,   /* 0x3fe393e0c0000000 */
+    6.20240390300750732422e-01,   /* 0x3fe3d90260000000 */
+    6.28608644008636474609e-01,   /* 0x3fe41d8fe0000000 */
+    6.36907458305358886719e-01,   /* 0x3fe4618bc0000000 */
+    6.45137906074523925781e-01,   /* 0x3fe4a4f840000000 */
+    6.53301239013671875000e-01,   /* 0x3fe4e7d800000000 */
+    6.61398470401763916016e-01,   /* 0x3fe52a2d20000000 */
+    6.69430613517761230469e-01,   /* 0x3fe56bf9c0000000 */
+    6.77398800849914550781e-01,   /* 0x3fe5ad4040000000 */
+    6.85303986072540283203e-01,   /* 0x3fe5ee02a0000000 */
+    6.93147122859954833984e-01};  /* 0x3fe62e42e0000000 */
+
+  static const double ln_tail_table[65] = {
+    0.00000000000000000000e+00,   /* 0x0000000000000000 */
+    5.15092497094772879206e-09,   /* 0x3e361f807c79f3db */
+    4.55457209735272790188e-08,   /* 0x3e6873c1980267c8 */
+    2.86612990859791781788e-08,   /* 0x3e5ec65b9f88c69e */
+    2.23596477332056055352e-08,   /* 0x3e58022c54cc2f99 */
+    3.49498983167142274770e-08,   /* 0x3e62c37a3a125330 */
+    3.23392843005887000414e-08,   /* 0x3e615cad69737c93 */
+    1.35722380472479366661e-08,   /* 0x3e4d256ab1b285e9 */
+    2.56504325268044191098e-08,   /* 0x3e5b8abcb97a7aa2 */
+    5.81213608741512136843e-08,   /* 0x3e6f34239659a5dc */
+    5.59374849578288093334e-08,   /* 0x3e6e07fd48d30177 */
+    5.06615629004996189970e-08,   /* 0x3e6b32df4799f4f6 */
+    5.24588857848400955725e-08,   /* 0x3e6c29e4f4f21cf8 */
+    9.61968535632653505972e-10,   /* 0x3e1086c848df1b59 */
+    1.34829655346594463137e-08,   /* 0x3e4cf456b4764130 */
+    3.65557749306383026498e-08,   /* 0x3e63a02ffcb63398 */
+    3.33431709374069198903e-08,   /* 0x3e61e6a6886b0976 */
+    5.13008650536088382197e-08,   /* 0x3e6b8abcb97a7aa2 */
+    5.09285070380306053751e-08,   /* 0x3e6b578f8aa35552 */
+    3.20853940845502057341e-08,   /* 0x3e6139c871afb9fc */
+    4.06713248643004200446e-08,   /* 0x3e65d5d30701ce64 */
+    5.57028186706125221168e-08,   /* 0x3e6de7bcb2d12142 */
+    5.48356693724804282546e-08,   /* 0x3e6d708e984e1664 */
+    1.99407553679345001938e-08,   /* 0x3e556945e9c72f36 */
+    1.96585517245087232086e-09,   /* 0x3e20e2f613e85bda */
+    6.68649386072067321503e-09,   /* 0x3e3cb7e0b42724f6 */
+    5.89936034642113390002e-08,   /* 0x3e6fac04e52846c7 */
+    2.85038578721554472484e-08,   /* 0x3e5e9b14aec442be */
+    5.09746772910284482606e-08,   /* 0x3e6b5de8034e7126 */
+    5.54234668933210171467e-08,   /* 0x3e6dc157e1b259d3 */
+    6.29100830926604004874e-09,   /* 0x3e3b05096ad69c62 */
+    2.61974119468563937716e-08,   /* 0x3e5c2116faba4cdd */
+    4.16752115011186398935e-08,   /* 0x3e665fcc25f95b47 */
+    2.47747534460820790327e-08,   /* 0x3e5a9a08498d4850 */
+    5.56922172017964209793e-08,   /* 0x3e6de647b1465f77 */
+    2.76162876992552906035e-08,   /* 0x3e5da71b7bf7861d */
+    7.08169709942321478061e-09,   /* 0x3e3e6a6886b09760 */
+    5.77453510221151779025e-08,   /* 0x3e6f0075eab0ef64 */
+    4.43021445893361960146e-09,   /* 0x3e33071282fb989b */
+    3.15140984357495864573e-08,   /* 0x3e60eb43c3f1bed2 */
+    2.95077445089736670973e-08,   /* 0x3e5faf06ecb35c84 */
+    1.44098510263167149349e-08,   /* 0x3e4ef1e63db35f68 */
+    1.05196987538551827693e-08,   /* 0x3e469743fb1a71a5 */
+    5.23641361722697546261e-08,   /* 0x3e6c1cdf404e5796 */
+    7.72099925253243069458e-09,   /* 0x3e4094aa0ada625e */
+    5.62089493829364197156e-08,   /* 0x3e6e2d4c96fde3ec */
+    3.53090261098577946927e-08,   /* 0x3e62f4d5e9a98f34 */
+    3.80080516835568242269e-08,   /* 0x3e6467c96ecc5cbe */
+    5.66961038386146408282e-08,   /* 0x3e6e7040d03dec5a */
+    4.42287063097349852717e-08,   /* 0x3e67bebf4282de36 */
+    3.45294525105681104660e-08,   /* 0x3e6289b11aeb783f */
+    2.47132034530447431509e-08,   /* 0x3e5a891d1772f538 */
+    3.59655343422487209774e-08,   /* 0x3e634f10be1fb591 */
+    5.51581770357780862071e-08,   /* 0x3e6d9ce1d316eb93 */
+    3.60171867511861372793e-08,   /* 0x3e63562a19a9c442 */
+    1.94511067964296180547e-08,   /* 0x3e54e2adf548084c */
+    1.54137376631349347838e-08,   /* 0x3e508ce55cc8c97a */
+    3.93171034490174464173e-09,   /* 0x3e30e2f613e85bda */
+    5.52990607758839766440e-08,   /* 0x3e6db03ebb0227bf */
+    3.29990737637586136511e-08,   /* 0x3e61b75bb09cb098 */
+    1.18436010922446096216e-08,   /* 0x3e496f16abb9df22 */
+    4.04248680368301346709e-08,   /* 0x3e65b3f399411c62 */
+    2.27418915900284316293e-08,   /* 0x3e586b3e59f65355 */
+    1.70263791333409206020e-08,   /* 0x3e52482ceae1ac12 */
+    5.76999904754328540596e-08};  /* 0x3e6efa39ef35793c */
+
+  /* Approximating polynomial coefficients for x near 1.0 */
+  static const double
+    ca_1 = 8.33333333333317923934e-02,  /* 0x3fb55555555554e6 */
+    ca_2 = 1.25000000037717509602e-02,  /* 0x3f89999999bac6d4 */
+    ca_3 = 2.23213998791944806202e-03,  /* 0x3f62492307f1519f */
+    ca_4 = 4.34887777707614552256e-04;  /* 0x3f3c8034c85dfff0 */
+
+  /* Approximating polynomial coefficients for other x */
+  static const double
+    cb_1 = 8.33333333333333593622e-02,  /* 0x3fb5555555555557 */
+    cb_2 = 1.24999999978138668903e-02,  /* 0x3f89999999865ede */
+    cb_3 = 2.23219810758559851206e-03;  /* 0x3f6249423bd94741 */
+
+  static const unsigned long long
+    log_thresh1 = 0x3fee0faa00000000,
+    log_thresh2 = 0x3ff1082c00000000;
+
+  /* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+     log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */
+  if (ux >= log_thresh1 && ux <= log_thresh2)
+    {
+      /* Arguments close to 1.0 are handled separately to maintain
+         accuracy.
+
+         The approximation in this region exploits the identity
+             log( 1 + r ) = log( 1 + u/2 )  -  log( 1 - u/2 ), where
+             u  = 2r / (2+r).
+         Note that the right hand side has an odd Taylor series expansion
+         which converges much faster than the Taylor series expansion of
+         log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by
+             u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1).
+
+         One subtlety is that since u cannot be calculated from
+         r exactly, the rounding error in the first u should be
+         avoided if possible. To accomplish this, we observe that
+                       u  =  r  -  r*r/(2+r).
+         Since x (=1+r) is the input argument, and thus presumed exact,
+         the formula above approximates u accurately because
+                       u  =  r  -  correction,
+         and the magnitude of "correction" (of the order of r*r)
+         is small.
+         With these observations, we will approximate log( 1 + r ) by
+            r + (  (A1*u^3 + ... + An*u^(2n+1)) - correction ).
+
+         We approximate log(1+r) by an odd polynomial in u, where
+                  u = 2r/(2+r) = r - r*r/(2+r).
+      */
+      r = x - 1.0;
+      u = r / (2.0 + r);
+      correction = r * u;
+      u = u + u;
+      v = u * u;
+      z1 = r;
+      z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction);
+      *r1 = z1;
+      *r2 = z2;
+      *xexp = 0;
+    }
+  else
+    {
+      /*
+        First, we decompose the argument x to the form
+        x  =  2**M  *  (F1  +  F2),
+        where  1 <= F1+F2 < 2, M has the value of an integer,
+        F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128.
+
+        Second, we approximate log( 1 + F2/F1 ) by an odd polynomial
+        in U, where U  =  2 F2 / (2 F1 + F2).
+        Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ).
+        The core approximation calculates
+        Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U   -   1.
+        Note that  log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ),
+        thus, Poly =  2 arctanh( U/2 ) / U  -  1.
+
+        It is not hard to see that
+          log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ).
+        Hence, we return Z1 = log(F1), and  Z2 = log( 1 + F2/F1).
+        The values of log(F1) are calculated beforehand and stored
+        in the program.
+      */
+
+      f = x;
+      if (ux < IMPBIT_DP64)
+        {
+          /* The input argument x is denormalized */
+          /* Normalize f by increasing the exponent by 60
+             and subtracting a correction to account for the implicit
+             bit. This replaces a slow denormalized
+             multiplication by a fast normal subtraction. */
+          static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */
+          GET_BITS_DP64(f, ux);
+          ux |= 0x03d0000000000000;
+          PUT_BITS_DP64(ux, f);
+          f -= corr;
+          GET_BITS_DP64(f, ux);
+          expadjust = 60;
+        }
+      else
+        expadjust = 0;
+
+      /* Store the exponent of x in xexp and put
+         f into the range [0.5,1) */
+      *xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust;
+      PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f);
+
+      /* Now  x = 2**xexp  * f,  1/2 <= f < 1. */
+
+      /* Set index to be the nearest integer to 128*f */
+      r = 128.0 * f;
+      index = (int)(r + 0.5);
+
+      z1 = ln_lead_table[index-64];
+      q = ln_tail_table[index-64];
+      f1 = index * 0.0078125; /* 0.0078125 = 1/128 */
+      f2 = f - f1;
+      /* At this point, x = 2**xexp * ( f1  +  f2 ) where
+         f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */
+
+      /* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */
+      /* u = f2 / (f1 + 0.5 * f2); */
+      u = f2 / (f1 + 0.5 * f2);
+
+      /* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1).
+         The core approximation calculates
+         poly = [log(1 + u/2) - log(1 - u/2)]/u  -  1  */
+      v = u * u;
+      poly = (v * (cb_1 + v * (cb_2 + v * cb_3)));
+      z2 = q + (u + u * poly);
+      *r1 = z1;
+      *r2 = z2;
+    }
+  return;
+}
+#endif /* USE_LOG_KERNEL_AMD */
+
+#if defined(USE_REMAINDER_PIBY2F_INLINE)
+/* Define this to get debugging print statements activated */
+#define DEBUGGING_PRINT
+#undef DEBUGGING_PRINT
+
+
+#ifdef DEBUGGING_PRINT
+#include <stdio.h>
+char *d2b(long long d, int bitsper, int point)  /* debug helper: low `bitsper` bits of d as a '0'/'1' string */
+{
+  static char buff[200];  /* returned to the caller, so each call overwrites the previous result */
+  int i, j;
+  j = bitsper;
+  if (point >= 0 && point < bitsper)  /* a '.' is only written for point in [0, bitsper); reserve its slot */
+    j++;
+  buff[j] = '\0';
+  for (i = bitsper - 1; i >= 0; i--)  /* fill right to left, least significant bit first */
+    {
+      j--;
+      if (((unsigned long long)d & 1) != 0)  /* d % 2 == 1 would be false for negative odd d */
+        buff[j] = '1';
+      else
+        buff[j] = '0';
+      if (i == point)
+        {
+          j--;
+          buff[j] = '.';  /* the point lands immediately left of digit `point` (0 = leftmost) */
+        }
+      d = (long long)((unsigned long long)d >> 1);  /* logical shift keeps the two's-complement bits of d < 0 */
+    }
+  return buff;
+}
+#endif
+
+/* Given ux, the IEEE-754 bit pattern of a positive double x, reduce x to
+   the range [-pi/4,pi/4] using extra precision, and return the result in *r.
+   *region receives how many lots of pi/2 were subtracted from x to put it
+   in the range [-pi/4,pi/4], mod 4. */
+static inline void __remainder_piby2f_inline(unsigned long long ux, double *r, int *region)
+{
+
+      /* This method simulates multi-precision floating-point
+         arithmetic and is accurate for all 1 <= x < infinity */
+#define bitsper 36
+      unsigned long long res[10];  /* only res[0..4] are written below */
+      unsigned long long u, carry, mask, mant, nextbits;
+      int first, last, i, rexp, xexp, resexp, ltb, determ, bc;
+      double dx;
+      static const double
+        piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
+#ifdef WINDOWS
+      static unsigned long long pibits[] =
+      {
+        0LL,
+        5215LL, 13000023176LL, 11362338026LL, 67174558139LL,
+        34819822259LL, 10612056195LL, 67816420731LL, 57840157550LL,
+        19558516809LL, 50025467026LL, 25186875954LL, 18152700886LL
+      };
+#else
+      static unsigned long long pibits[] =
+      {
+        0L,
+        5215L, 13000023176L, 11362338026L, 67174558139L,
+        34819822259L, 10612056195L, 67816420731L, 57840157550L,
+        19558516809L, 50025467026L, 25186875954L, 18152700886L
+      };
+#endif
+
+#ifdef DEBUGGING_PRINT
+      printf("On entry, x = %25.20e = %s\n", x, double2hex(&x));  /* NOTE(review): no `x` is declared in this function (only ux) -- fix before enabling DEBUGGING_PRINT */
+#endif
+
+      xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64);
+      ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29;  /* 53-bit significand >> 29 leaves its top 24 bits */
+
+#ifdef DEBUGGING_PRINT
+      printf("ux = %s\n", d2b(ux, 64, -1));
+#endif
+
+      /* Now ux is the mantissa bit pattern of x as a long integer */
+      mask = 1;
+      mask = (mask << bitsper) - 1;
+
+      /* Set first and last to the positions of the first
+         and last chunks of 2/pi that we need */
+      first = xexp / bitsper;
+      resexp = xexp - first * bitsper;
+      /* 120 is the theoretical maximum number of bits (actually
+         115 for IEEE single precision) that we need to extract
+         from the middle of 2/pi to compute the reduced argument
+         accurately enough for our purposes */
+      last = first + 120 / bitsper;  /* 120 / 36 == 3, so the four chunks first..last are used */
+
+#ifdef DEBUGGING_PRINT
+      printf("first = %d, last = %d\n", first, last);
+#endif
+
+      /* Do a long multiplication of the bits of 2/pi by the
+         integer mantissa */
+      /* Unroll the loop. This is only correct because we know
+         that bitsper is fixed as 36. */
+      res[4] = 0;
+      u = pibits[last] * ux;
+      res[3] = u & mask;
+      carry = u >> bitsper;
+      u = pibits[last - 1] * ux + carry;
+      res[2] = u & mask;
+      carry = u >> bitsper;
+      u = pibits[last - 2] * ux + carry;
+      res[1] = u & mask;
+      carry = u >> bitsper;
+      u = pibits[first] * ux + carry;
+      res[0] = u & mask;
+
+#ifdef DEBUGGING_PRINT
+      printf("resexp = %d\n", resexp);
+      printf("Significant part of x * 2/pi with binary"
+             " point in correct place:\n");
+      for (i = 0; i <= last - first; i++)
+        {
+          if (i > 0 && i % 5 == 0)
+            printf("\n ");
+          if (i == 1)
+            printf("%s ", d2b(res[i], bitsper, resexp));
+          else
+            printf("%s ", d2b(res[i], bitsper, -1));
+        }
+      printf("\n");
+#endif
+
+      /* Reconstruct the result */
+      ltb = (int)((((res[0] << bitsper) | res[1])
+                   >> (bitsper - 1 - resexp)) & 7);
+
+      /* determ says whether the fractional part is >= 0.5 */
+      determ = ltb & 1;
+
+#ifdef DEBUGGING_PRINT
+      printf("ltb = %d (last two bits before binary point"
+             " and first bit after)\n", ltb);
+      printf("determ = %d (1 means need to negate because the fractional\n"
+             "            part of x * 2/pi is greater than 0.5)\n", determ);
+#endif
+
+      i = 1;
+      if (determ)
+        {
+          /* The mantissa is >= 0.5. We want to subtract it
+             from 1.0 by negating all the bits */
+          *region = ((ltb >> 1) + 1) & 3;
+          mant = 1;
+          mant = ~(res[1]) & ((mant << (bitsper - resexp)) - 1);
+          while (mant < 0x0000000000010000)
+            {
+              i++;
+              mant = (mant << bitsper) | (~(res[i]) & mask);
+            }
+          nextbits = (~(res[i+1]) & mask);
+        }
+      else
+        {
+          *region = (ltb >> 1);
+          mant = 1;
+          mant = res[1] & ((mant << (bitsper - resexp)) - 1);
+          while (mant < 0x0000000000010000)
+            {
+              i++;
+              mant = (mant << bitsper) | res[i];
+            }
+          nextbits = res[i+1];
+        }
+
+#ifdef DEBUGGING_PRINT
+      printf("First bits of mant = %s\n", d2b(mant, bitsper, -1));
+#endif
+
+      /* Normalize the mantissa. The shift value 6 here, determined by
+         trial and error, seems to give optimal speed. */
+      bc = 0;
+      while (mant < 0x0000400000000000LL)
+        {
+          bc += 6;
+          mant <<= 6;
+        }
+      while (mant < 0x0010000000000000LL)
+        {
+          bc++;
+          mant <<= 1;
+        }
+      mant |= nextbits >> (bitsper - bc);  /* refill the low bits vacated by the bc-bit shift from the next chunk */
+
+      rexp = 52 + resexp - bc - i * bitsper;
+
+#ifdef DEBUGGING_PRINT
+      printf("Normalised mantissa = 0x%016lx\n", mant);  /* NOTE(review): mant is unsigned long long; %lx assumes long is as wide -- confirm */
+      printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp);
+#endif
+
+      /* Put the result exponent rexp onto the mantissa pattern */
+      u = ((unsigned long long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64;
+      ux = (mant & MANTBITS_DP64) | u;
+      if (determ)
+        /* If we negated the mantissa we negate x too */
+        ux |= SIGNBIT_DP64;
+      PUT_BITS_DP64(ux, dx);
+
+#ifdef DEBUGGING_PRINT
+      printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx));
+#endif
+
+      /* dx is a double precision version of the fractional part of
+         x * 2 / pi. Multiply dx by pi/2 in double precision
+         to get the reduced argument r. */
+      *r = dx * piby2;
+
+#ifdef DEBUGGING_PRINT
+      printf(" r = frac(x*2/pi) * pi/2:\n");
+      printf(" r = %25.20e = %s\n", *r, double2hex(r));
+      printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n",
+             *region);
+#endif
+}
+#endif /* USE_REMAINDER_PIBY2F_INLINE */
+
+#if defined(WINDOWS)
+#if defined(USE_HANDLE_ERROR) || defined(USE_HANDLE_ERRORF)
+#include <errno.h>
+#endif
+
+#if defined(USE_HANDLE_ERROR)
+/* Microsoft-specific error handler (double): fill a _exception record, raise flags, call _matherr, return retval */
+static __declspec(noinline) double handle_error(const char *name,
+                                                unsigned long long value,
+                                                int type, int flags, int error,
+                                                double arg1, double arg2)
+{
+  double z;
+  struct _exception exception_data;
+  exception_data.type = type;
+  exception_data.name = (char*)name;
+  exception_data.arg1 = arg1;
+  exception_data.arg2 = arg2;
+  PUT_BITS_DP64(value, z);  /* `value` is the bit pattern of the default return value */
+  exception_data.retval = z;
+  raise_fpsw_flags(flags);
+  if (!_matherr(&exception_data))  /* _matherr returned 0: store `error` in errno */
+    {
+      errno = error;
+    }
+  return exception_data.retval;  /* read back from the record after the _matherr call */
+}
+#endif /* USE_HANDLE_ERROR */
+
+#if defined(USE_HANDLE_ERRORF)
+static __declspec(noinline) float handle_errorf(const char *name,  /* float counterpart of handle_error */
+                                                unsigned int value,
+                                                int type, int flags, int error,
+                                                float arg1, float arg2)
+{
+  float z;
+  struct _exception exception_data;
+  exception_data.type = type;
+  exception_data.name = (char*)name;
+  exception_data.arg1 = (double)arg1;  /* the record's fields are assigned as double, so widen the arguments */
+  exception_data.arg2 = (double)arg2;
+  PUT_BITS_SP32(value, z);  /* `value` is the 32-bit pattern of the default return value */
+  exception_data.retval = z;
+  raise_fpsw_flags(flags);
+  if (!_matherr(&exception_data))  /* _matherr returned 0: store `error` in errno */
+    {
+      errno = error;
+    }
+  return (float)exception_data.retval;  /* narrow the record's retval back to float */
+}
+#endif /* USE_HANDLE_ERRORF */
+#endif /* WINDOWS */
+
+#endif /* LIBM_INLINES_AMD_H_INCLUDED */
diff --git a/inc/libm_special.h b/inc/libm_special.h
new file mode 100644
index 0000000..0833b7b
--- /dev/null
+++ b/inc/libm_special.h
@@ -0,0 +1,84 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef __LIBM_SPECIAL_H__
+#define __LIBM_SPECIAL_H__
+
+// exception status set
+#define MXCSR_ES_INEXACT       0x00000020
+#define MXCSR_ES_UNDERFLOW     0x00000010
+#define MXCSR_ES_OVERFLOW      0x00000008
+#define MXCSR_ES_DIVBYZERO     0x00000004
+#define MXCSR_ES_INVALID       0x00000001
+
+void __amd_handle_errorf(int type, int error, const char *name,
+                    float arg1, unsigned int arg1_is_snan,
+                    float arg2, unsigned int arg2_is_snan,
+                    float retval, unsigned int retval_is_snan);
+
+void __amd_handle_error(int type, int error, const char *name,
+                   double arg1,
+                   double arg2,
+                   double retval);
+
+/* Code from GRTE/v4 math.h */
+/* Exception record passed to matherr().  */
+#ifndef DOMAIN
+struct exception
+  {
+    int type;       /* exception kind: one of the DOMAIN ... PLOSS codes defined in this header */
+    char *name;     /* name of the function reporting the error (src/acos.c stores "acos") */
+    double arg1;    /* first argument of the failing call */
+    double arg2;    /* second argument; src/acos.c stores x here as well for its one-argument call */
+    double retval;  /* value the reporting function returns after matherr() has been consulted */
+  };
+
+extern int matherr (struct exception *__exc);
+
+# define X_TLOSS        1.41484755040568800000e+16
+
+/* Types of exceptions in the `type' field.  */
+# define DOMAIN         1
+# define SING           2
+# define OVERFLOW       3
+# define UNDERFLOW      4
+# define TLOSS          5
+# define PLOSS          6
+
+/* SVID mode specifies returning this large value instead of infinity.  */
+# define HUGE           3.40282347e+38F
+
+/* Use this define to enable a (dummy) definition of matherr().  */
+#define NEED_FAKE_MATHERR
+
+#else   /* DOMAIN already defined */
+
+# ifdef __USE_XOPEN
+/* X/Open wants another strange constant.  */
+#  define MAXFLOAT      3.40282347e+38F
+# endif
+
+#endif  /* DOMAIN */
+/* Code from GRTE/v4 math.h */
+
+#endif // __LIBM_SPECIAL_H__
diff --git a/inc/libm_util_amd.h b/inc/libm_util_amd.h
new file mode 100644
index 0000000..f7347d0
--- /dev/null
+++ b/inc/libm_util_amd.h
@@ -0,0 +1,195 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#ifndef LIBM_UTIL_AMD_H_INCLUDED
+#define LIBM_UTIL_AMD_H_INCLUDED 1
+
+
+
+
+
+
+typedef float F32;
+typedef unsigned int U32;
+typedef int S32;
+
+typedef double F64;
+typedef unsigned long long  U64;
+typedef long long S64;
+
+union UT32_  /* overlays a 32-bit float with an unsigned integer of the same size */
+{
+    F32 f32;
+    U32 u32;
+};
+
+union UT64_  /* overlays a 64-bit double with a 64-bit integer and with pairs of 32-bit values */
+{
+    F64 f64;
+    U64 u64;
+    
+    F32 f32[2];  /* the same storage viewed as two floats */
+    U32 u32[2];  /* the same storage viewed as two 32-bit unsigned integers */
+};
+
+typedef union UT32_ UT32;
+typedef union UT64_ UT64;
+
+
+
+
+#define QNAN_MASK_32        0x00400000
+#define QNAN_MASK_64        0x0008000000000000
+
+
+#define MULTIPLIER_SP 24
+#define MULTIPLIER_DP 53
+
+static const double VAL_2PMULTIPLIER_DP =  9007199254740992.0;
+static const double VAL_2PMMULTIPLIER_DP = 1.1102230246251565404236316680908e-16;
+static const float VAL_2PMULTIPLIER_SP =  16777216.0F;
+static const float VAL_2PMMULTIPLIER_SP = 5.9604645e-8F;
+
+
+
+
+
+/* Definitions for double functions on 64 bit machines */
+#define SIGNBIT_DP64      0x8000000000000000
+#define EXPBITS_DP64      0x7ff0000000000000
+#define MANTBITS_DP64     0x000fffffffffffff
+#define ONEEXPBITS_DP64   0x3ff0000000000000
+#define TWOEXPBITS_DP64   0x4000000000000000
+#define HALFEXPBITS_DP64  0x3fe0000000000000
+#define IMPBIT_DP64       0x0010000000000000
+#define QNANBITPATT_DP64  0x7ff8000000000000
+#define INDEFBITPATT_DP64 0xfff8000000000000
+#define PINFBITPATT_DP64  0x7ff0000000000000
+#define NINFBITPATT_DP64  0xfff0000000000000
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046
+#define EMAX_DP64         1023
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+
+
+/* These definitions, used by float functions,
+   are for both 32 and 64 bit machines */
+#define SIGNBIT_SP32      0x80000000
+#define EXPBITS_SP32      0x7f800000
+#define MANTBITS_SP32     0x007fffff
+#define ONEEXPBITS_SP32   0x3f800000
+#define TWOEXPBITS_SP32   0x40000000
+#define HALFEXPBITS_SP32  0x3f000000
+#define IMPBIT_SP32       0x00800000
+#define QNANBITPATT_SP32  0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32  0x7f800000
+#define NINFBITPATT_SP32  0xff800000
+#define EXPBIAS_SP32      127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32   1
+#define EMIN_SP32         -126
+#define BIASEDEMAX_SP32   254
+#define EMAX_SP32         127
+#define LAMBDA_SP32       1.0e30
+#define MANTLENGTH_SP32   24
+#define BASEDIGITS_SP32   7
+
+#define CLASS_SIGNALLING_NAN 1
+#define CLASS_QUIET_NAN 2
+#define CLASS_NEGATIVE_INFINITY 3
+#define CLASS_NEGATIVE_NORMAL_NONZERO 4
+#define CLASS_NEGATIVE_DENORMAL 5
+#define CLASS_NEGATIVE_ZERO 6
+#define CLASS_POSITIVE_ZERO 7
+#define CLASS_POSITIVE_DENORMAL 8
+#define CLASS_POSITIVE_NORMAL_NONZERO 9
+#define CLASS_POSITIVE_INFINITY 10
+
+#define OLD_BITS_SP32(x) (*((unsigned int *)&x))
+#define OLD_BITS_DP64(x) (*((unsigned long long *)&x))
+
+/* Alternatives to the above functions which don't have
+   problems when using high optimization levels on gcc */
+#define GET_BITS_SP32(x, ux) \
+  { \
+    volatile union {float f; unsigned int i;} _bitsy; \
+    _bitsy.f = (x); \
+    ux = _bitsy.i; \
+  }
+#define PUT_BITS_SP32(ux, x) \
+  { \
+    volatile union {float f; unsigned int i;} _bitsy; \
+    _bitsy.i = (ux); \
+     x = _bitsy.f; \
+  }
+
+#define GET_BITS_DP64(x, ux) \
+  { \
+    volatile union {double d; unsigned long long i;} _bitsy; \
+    _bitsy.d = (x); \
+    ux = _bitsy.i; \
+  }
+#define PUT_BITS_DP64(ux, x) \
+  { \
+    volatile union {double d; unsigned long long i;} _bitsy; \
+    _bitsy.i = (ux); \
+    x = _bitsy.d; \
+  }
+
+
+/* Processor-dependent floating-point status flags */
+#define AMD_F_INEXACT 0x00000020
+#define AMD_F_UNDERFLOW 0x00000010
+#define AMD_F_OVERFLOW 0x00000008
+#define AMD_F_DIVBYZERO 0x00000004
+#define AMD_F_INVALID 0x00000001
+
+/* Processor-dependent floating-point precision-control flags */
+#define AMD_F_EXTENDED 0x00000300
+#define AMD_F_DOUBLE   0x00000200
+#define AMD_F_SINGLE   0x00000000
+
+/* Processor-dependent floating-point rounding-control flags */
+#define AMD_F_RC_NEAREST 0x00000000
+#define AMD_F_RC_DOWN    0x00002000
+#define AMD_F_RC_UP      0x00004000
+#define AMD_F_RC_ZERO    0x00006000
+
+/* How to get hold of an assembly square root instruction:
+ *   ASMSQRT(x,y) computes y = sqrt(x).
+ */
+#ifdef WINDOWS
+/* VC++ intrinsic call */
+#define ASMSQRT(x,y) _mm_store_sd(&y, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&x)));
+#else
+/* Hammer sqrt instruction */
+#define ASMSQRT(x,y) asm volatile ("sqrtsd %1, %0" : "=x" (y) : "x" (x));
+#endif
+
+#endif /* LIBM_UTIL_AMD_H_INCLUDED */
diff --git a/libacml.h b/libacml.h
new file mode 100644
index 0000000..92c2ccb
--- /dev/null
+++ b/libacml.h
@@ -0,0 +1,84 @@
+// Copyright 2010 and onwards Google Inc.
+// Author: Martin Thuresson
+//
+// Expose fast k8 implementation of math functions with the prefix
+// "acml_".  Currently acml_log(), acml_exp(), and acml_pow() have
+// shown to have significantly better performance over glibc libm
+// and at least as good precision.
+// https://wiki.corp.google.com/twiki/bin/view/Main/CompilerMathOptimization
+//
+// When built with --cpu=piii, acml_* will call the pure libm functions,
+// avoiding the need to special case the calls.
+//
+// TODO(martint): Update glibc to match the libacml performance.
+
+#ifndef THIRD_PARTY__OPEN64_LIBACML_MV__LIBACML_H_
+#define THIRD_PARTY__OPEN64_LIBACML_MV__LIBACML_H_
+
+// USE_LIBACML_IMPLEMENTATION selects the k8 routines (1) or plain libm (0).
+// It may be predefined by the build; otherwise it defaults to 1 on x86-64.
+// The value is computed here rather than defined as "defined(__x86_64__)",
+// because a macro that expands to `defined` inside #if is undefined behavior.
+#ifndef USE_LIBACML_IMPLEMENTATION
+#if defined(__x86_64__)
+#define USE_LIBACML_IMPLEMENTATION 1
+#else
+#define USE_LIBACML_IMPLEMENTATION 0
+#endif
+#endif
+
+#if USE_LIBACML_IMPLEMENTATION
+#include "third_party/open64_libacml_mv/inc/fn_macros.h"
+#else
+#include <math.h>
+#endif
+
+extern "C" {
+
+#if USE_LIBACML_IMPLEMENTATION
+// The k8 implementation of the math functions.
+#define acml_exp_k8 FN_PROTOTYPE(exp)
+#define acml_expf_k8 FN_PROTOTYPE(expf)
+#define acml_log_k8 FN_PROTOTYPE(log)
+#define acml_pow_k8 FN_PROTOTYPE(pow)
+double acml_exp_k8(double x);
+float acml_expf_k8(float x);
+double acml_log_k8(double x);
+double acml_pow_k8(double x, double y);
+#endif
+
+static inline double acml_exp(double x) {  // exp(x) via the k8 routine when enabled, else libm exp()
+#if USE_LIBACML_IMPLEMENTATION
+  return acml_exp_k8(x);
+#else
+  return exp(x);
+#endif
+}
+
+static inline float acml_expf(float x) {  // single-precision exp(x) via the k8 routine when enabled, else libm expf()
+#if USE_LIBACML_IMPLEMENTATION
+  return acml_expf_k8(x);
+#else
+  return expf(x);
+#endif
+}
+
+static inline double acml_log(double x) {  // log(x) via the k8 routine when enabled, else libm log()
+#if USE_LIBACML_IMPLEMENTATION
+  return acml_log_k8(x);
+#else
+  return log(x);
+#endif
+}
+
+static inline double acml_pow(double x, double y) {  // pow(x, y) via the k8 routine when enabled, else libm pow()
+#if USE_LIBACML_IMPLEMENTATION
+  return acml_pow_k8(x, y);
+#else
+  return pow(x, y);
+#endif
+}
+
+}
+
+#endif  // THIRD_PARTY__OPEN64_LIBACML_MV__LIBACML_H_
diff --git a/libacml_portability_test.cc b/libacml_portability_test.cc
new file mode 100644
index 0000000..1f62d1a
--- /dev/null
+++ b/libacml_portability_test.cc
@@ -0,0 +1,16 @@
+#include "testing/base/public/gmock.h"
+#include "testing/base/public/gunit.h"
+#include "third_party/open64_libacml_mv/libacml.h"
+
+namespace {
+
+using ::testing::Eq;
+
+TEST(LibacmlPortabilityTest, Trivial) {  // smoke test: each acml_* wrapper is called once and compared with Eq() against an exact result
+  EXPECT_THAT(acml_exp(0), Eq(1));
+  EXPECT_THAT(acml_expf(0), Eq(1));
+  EXPECT_THAT(acml_pow(2, 2), Eq(4));
+  EXPECT_THAT(acml_log(1), Eq(0));
+}
+
+}  // namespace
diff --git a/src/acos.c b/src/acos.c
new file mode 100644
index 0000000..26bac6c
--- /dev/null
+++ b/src/acos.c
@@ -0,0 +1,183 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VAL_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NAN_WITH_FLAGS
+#undef USE_VAL_WITH_FLAGS
+#undef USE_HANDLE_ERROR
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error exit for acos: fill a matherr record, report, and set errno. */
+static inline double retval_errno_edom(double x)
+{
+  struct exception err;
+  err.type = DOMAIN;
+  err.name = (char *)"acos";
+  err.arg1 = x;
+  err.arg2 = x;
+  /* SVID mode returns HUGE; otherwise the value built by nan_with_flags(). */
+  err.retval = (_LIB_VERSION == _SVID_) ? HUGE : nan_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, no message is printed and errno is left alone. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("acos: DOMAIN error\n", stderr);
+    }
+  /* Reached in POSIX mode, or when matherr() returned zero. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(acos)
+#endif
+
+double FN_PROTOTYPE(acos)(double x)
+{
+  /* Computes arccos(x).
+     The argument is first reduced by noting that arccos(x)
+     is invalid for abs(x) > 1. For denormal and small
+     arguments arccos(x) = pi/2 to machine accuracy.
+     Remaining argument ranges are handled as follows.
+     For abs(x) < 0.5 use
+     arccos(x) = pi/2 - arcsin(x)
+     = pi/2 - (x + x^3*R(x^2))
+     where R(x^2) is a rational minimax approximation to
+     (arcsin(x) - x)/x^3.
+     For abs(x) >= 0.5 set s = sqrt((1-abs(x))/2) and use
+     arccos(x) = 2*arcsin(s) for x > 0, pi - 2*arcsin(s) for x < 0,
+     together with the above rational approximation, and
+     reconstruct the terms carefully.
+  */
+
+  /* Constants; piby2_head + piby2_tail is pi/2 split into two doubles. */
+
+  static const double
+    pi         = 3.1415926535897933e+00, /* 0x400921fb54442d18 */
+    piby2      = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
+    piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
+    piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
+
+  double u, y, s=0.0, r;
+  int xexp, xnan, transform=0;
+
+  unsigned long long ux, aux, xneg;
+  GET_BITS_DP64(x, ux);
+  aux = ux & ~SIGNBIT_DP64;         /* bits of x with the sign cleared */
+  xneg = (ux & SIGNBIT_DP64);       /* non-zero when x is negative */
+  xnan = (aux > PINFBITPATT_DP64);  /* magnitude bits above +infinity */
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+
+  /* Special cases: NaN, tiny abs(x), and abs(x) >= 1 (xexp is the unbiased exponent) */
+
+  if (xnan)
+    {
+#ifdef WINDOWS
+      return handle_error("acos", ux|0x0008000000000000, _DOMAIN,
+                          0, EDOM, x, 0.0);
+#else
+      return x + x; /* With invalid if it's a signalling NaN */
+#endif
+    }
+  else if (xexp < -56)
+    { /* abs(x) < 2^-56: arccos(x) = pi/2 to machine accuracy */
+      return val_with_flags(piby2, AMD_F_INEXACT);
+    }
+  else if (xexp >= 0)
+    { /* abs(x) >= 1.0 */
+      if (x == 1.0)
+        return 0.0;
+      else if (x == -1.0)
+        return val_with_flags(pi, AMD_F_INEXACT);
+      else
+#ifdef WINDOWS
+        return handle_error("acos", INDEFBITPATT_DP64, _DOMAIN,
+                            AMD_F_INVALID, EDOM, x, 0.0);
+#else
+        return retval_errno_edom(x);
+#endif
+    }
+
+  if (xneg) y = -x;  /* y = abs(x) */
+  else y = x;
+
+  transform = (xexp >= -1); /* abs(x) >= 0.5 */
+
+  if (transform)
+    { /* r = (1-y)/2 lies in (0,0.25]; y is replaced by sqrt(r) in (0,0.5] */
+      r = 0.5*(1.0 - y);
+#ifdef WINDOWS
+      /* VC++ intrinsic call */
+      _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
+#else
+      /* Hammer sqrt instruction */
+      asm volatile ("sqrtsd %1, %0" : "=x" (s) : "x" (r));
+#endif
+      y = s;
+    }
+  else
+    r = y*y;
+
+  /* u = r*P(r)/Q(r) with r = y*y, so y*u approximates arcsin(y) - y on [0.0, 0.5] */
+
+  u = r*(0.227485835556935010735943483075 +
+         (-0.445017216867635649900123110649 +
+          (0.275558175256937652532686256258 +
+           (-0.0549989809235685841612020091328 +
+            (0.00109242697235074662306043804220 +
+             0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
+    (1.36491501334161032038194214209 +
+     (-3.28431505720958658909889444194 +
+      (2.76568859157270989520376345954 +
+       (-0.943639137032492685763471240072 +
+	0.105869422087204370341222318533*r)*r)*r)*r);
+
+  if (transform)
+    { /* Reconstruct acos carefully in transformed region */
+      if (xneg) return pi - 2.0*(s+(y*u - piby2_tail));
+      else
+	{
+	  double c, s1;
+	  unsigned long long us;
+	  GET_BITS_DP64(s, us);
+	  PUT_BITS_DP64(0xffffffff00000000 & us, s1); /* s1 = s, low 32 bits cleared */
+	  c = (r-s1*s1)/(s+s1); /* c ~= sqrt(r) - s1, the part of the root s1 lacks */
+          return 2.0*s1 + (2.0*c+2.0*y*u);
+	}
+    }
+  else
+    return piby2_head - (x - (piby2_tail - x*u));
+}
+
+weak_alias (__acos, acos)
diff --git a/src/acosf.c b/src/acosf.c
new file mode 100644
index 0000000..4464661
--- /dev/null
+++ b/src/acosf.c
@@ -0,0 +1,181 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VALF_WITH_FLAGS
+#define USE_NANF_WITH_FLAGS
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NANF_WITH_FLAGS
+#undef USE_VALF_WITH_FLAGS
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error exit for acosf: fill a matherr record, report, and set errno. */
+static inline float retval_errno_edom(float x)
+{
+  struct exception err;
+  err.type = DOMAIN;
+  err.name = (char *)"acosf";
+  err.arg1 = (double)x;
+  err.arg2 = (double)x;
+  /* SVID mode returns HUGE; otherwise the value built by nanf_with_flags(). */
+  err.retval = (_LIB_VERSION == _SVID_) ? HUGE : nanf_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, no message is printed and errno is left alone. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("acosf: DOMAIN error\n", stderr);
+    }
+  /* Reached in POSIX mode, or when matherr() returned zero. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(acosf)
+#endif
+
+float FN_PROTOTYPE(acosf)(float x)
+{
+  /* Computes arccos(x).
+     The argument is first reduced by noting that arccos(x)
+     is invalid for abs(x) > 1. For denormal and small
+     arguments arccos(x) = pi/2 to machine accuracy.
+     Remaining argument ranges are handled as follows.
+     For abs(x) < 0.5 use
+     arccos(x) = pi/2 - arcsin(x)
+     = pi/2 - (x + x^3*R(x^2))
+     where R(x^2) is a rational minimax approximation to
+     (arcsin(x) - x)/x^3.
+     For abs(x) >= 0.5 set s = sqrt((1-abs(x))/2) and use
+     arccos(x) = 2*arcsin(s) for x > 0, pi - 2*arcsin(s) for x < 0,
+     together with the above rational approximation, and
+     reconstruct the terms carefully.
+  */
+
+  /* Constants; the double piby2_head + piby2_tail is pi/2 split in two. */
+
+  static const float
+    piby2      = 1.5707963705e+00F; /* 0x3fc90fdb */
+  static const double
+    pi         = 3.1415926535897933e+00, /* 0x400921fb54442d18 */
+    piby2_head = 1.5707963267948965580e+00, /* 0x3ff921fb54442d18 */
+    piby2_tail = 6.12323399573676603587e-17; /* 0x3c91a62633145c07 */
+
+  float u, y, s = 0.0F, r;
+  int xexp, xnan, transform = 0;
+
+  unsigned int ux, aux, xneg;
+
+  GET_BITS_SP32(x, ux);
+  aux = ux & ~SIGNBIT_SP32;         /* bits of x with the sign cleared */
+  xneg = (ux & SIGNBIT_SP32);       /* non-zero when x is negative */
+  xnan = (aux > PINFBITPATT_SP32);  /* magnitude bits above +infinity */
+  xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+  /* Special cases: NaN, tiny abs(x), and abs(x) >= 1 (xexp is the unbiased exponent) */
+
+  if (xnan)
+    {
+#ifdef WINDOWS
+      return handle_errorf("acosf", ux|0x00400000, _DOMAIN, 0,
+                           EDOM, x, 0.0F);
+#else
+      return x + x; /* With invalid if it's a signalling NaN */
+#endif
+    }
+  else if (xexp < -26)
+    /* abs(x) < 2^-26: arccos(x) = pi/2 to machine accuracy */
+    return valf_with_flags(piby2, AMD_F_INEXACT);
+  else if (xexp >= 0)
+    { /* abs(x) >= 1.0 */
+      if (x == 1.0F)
+        return 0.0F;
+      else if (x == -1.0F)
+        return valf_with_flags((float)pi, AMD_F_INEXACT);
+      else
+#ifdef WINDOWS
+        return handle_errorf("acosf", INDEFBITPATT_SP32, _DOMAIN,
+                             AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+        return retval_errno_edom(x);
+#endif
+    }
+
+  if (xneg) y = -x;  /* y = abs(x) */
+  else y = x;
+
+  transform = (xexp >= -1); /* abs(x) >= 0.5 */
+
+  if (transform)
+    { /* r = (1-y)/2 lies in (0,0.25]; y is replaced by sqrt(r) in (0,0.5] */
+      r = 0.5F*(1.0F - y);
+#ifdef WINDOWS
+      /* VC++ intrinsic call */
+      _mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
+#else
+      /* Hammer sqrt instruction */
+      asm volatile ("sqrtss %1, %0" : "=x" (s) : "x" (r));
+#endif
+      y = s;
+    }
+  else
+    r = y*y;
+
+  /* u = r*P(r)/Q(r) with r = y*y, so y*u approximates arcsin(y) - y on [0.0, 0.5] */
+
+  u=r*(0.184161606965100694821398249421F +
+       (-0.0565298683201845211985026327361F +
+	(-0.0133819288943925804214011424456F -
+	 0.00396137437848476485201154797087F*r)*r)*r)/
+    (1.10496961524520294485512696706F -
+     0.836411276854206731913362287293F*r);
+
+  if (transform)
+    {
+      /* Reconstruct acos carefully in transformed region */
+      if (xneg)
+        return (float)(pi - 2.0*(s+(y*u - piby2_tail)));
+      else
+	{
+	  float c, s1;
+	  unsigned int us;
+	  GET_BITS_SP32(s, us);
+	  PUT_BITS_SP32(0xffff0000 & us, s1); /* s1 = s, low 16 bits cleared */
+	  c = (r-s1*s1)/(s+s1); /* c ~= sqrt(r) - s1, the part of the root s1 lacks */
+          return 2.0F*s1 + (2.0F*c+2.0F*y*u);
+	}
+    }
+  else
+    return (float)(piby2_head - (x - (piby2_tail - x*u)));
+}
+
+weak_alias (__acosf, acosf)
diff --git a/src/acosh.c b/src/acosh.c
new file mode 100644
index 0000000..f1d62c6
--- /dev/null
+++ b/src/acosh.c
@@ -0,0 +1,447 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_NAN_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#define USE_LOG_KERNEL_AMD
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NAN_WITH_FLAGS
+#undef USE_HANDLE_ERROR
+#undef USE_LOG_KERNEL_AMD
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error exit for acosh: fill a matherr record, report, and set errno. */
+static inline double retval_errno_edom(double x)
+{
+  struct exception err;
+  err.name = (char *)"acosh";
+  err.type = DOMAIN;
+  err.arg1 = x;
+  err.arg2 = x;
+  /* SVID mode returns -HUGE; otherwise the value built by nan_with_flags(). */
+  err.retval = (_LIB_VERSION == _SVID_) ? -HUGE : nan_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, no message is printed and errno is left alone. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("acosh: DOMAIN error\n", stderr);
+    }
+  /* Reached in POSIX mode, or when matherr() returned zero. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#undef _FUNCNAME
+#define _FUNCNAME "acosh"
+double FN_PROTOTYPE(acosh)(double x)
+{
+  /* Computes acosh(x); arguments below 1.0 and -infinity take the domain-error path. */
+  unsigned long long ux;
+  double r, rarg, r1, r2;
+  int xexp;
+
+  static const unsigned long long
+    recrteps = 0x4196a09e667f3bcd; /* 1/sqrt(eps) = 9.49062656242515593767e+07 */
+  /* log2_lead and log2_tail sum to an extra-precise version
+     of log(2) */
+
+  static const double
+    log2_lead = 6.93147122859954833984e-01,  /* 0x3fe62e42e0000000 */
+    log2_tail = 5.76999904754328540596e-08;  /* 0x3e6efa39ef35793c */
+
+
+  GET_BITS_DP64(x, ux);
+
+  if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_DP64)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, ux|0x0008000000000000, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity */
+          if (ux & SIGNBIT_DP64)
+            /* x is negative infinity. Return a NaN. */
+#ifdef WINDOWS
+            return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN,
+                                AMD_F_INVALID, EDOM, x, 0.0);
+#else
+            return retval_errno_edom(x);
+#endif
+          else
+            /* Return positive infinity with no signal */
+            return x;
+        }
+    }
+  else if ((ux & SIGNBIT_DP64) || (ux <= 0x3ff0000000000000))
+    {
+      /* x <= 1.0 (0x3ff0000000000000 is the bit pattern of 1.0) */
+      if (ux == 0x3ff0000000000000)
+        {
+          /* x = 1.0; return zero. */
+          return 0.0;
+        }
+      else
+        {
+          /* x is less than 1.0. Return a NaN. */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return retval_errno_edom(x);
+#endif
+        }
+    }
+
+
+  if (ux > recrteps)
+    {
+      /* Arguments greater than 1/sqrt(epsilon) in magnitude are
+         approximated by acosh(x) = ln(2) + ln(x) */
+      /* log_kernel_amd64(x, ux, ...) returns xexp, r1, r2 such that
+         log(x) = xexp*log(2) + r1 + r2 */
+      log_kernel_amd64(x, ux, &xexp, &r1, &r2);
+      /* Add (xexp+1) * log(2) to r1,r2 to get the result acosh(x).
+         The computed r1 is not subject to rounding error because
+         (xexp+1) has at most 10 significant bits, log2_lead has 24 significant
+         bits, and r1 has up to 24 bits; and the exponents of r1
+         and r2 differ by at most 6. */
+      r1 = ((xexp+1) * log2_lead + r1);
+      r2 = ((xexp+1) * log2_tail + r2);
+      return r1 + r2;
+    }
+  else if (ux >= 0x4060000000000000)
+    {
+      /* 128.0 <= x <= 1/sqrt(epsilon) */
+      /* acosh for these arguments is approximated by
+         acosh(x) = ln(x + sqrt(x*x-1)) */
+      rarg = x*x-1.0;
+      /* Use assembly instruction to compute r = sqrt(rarg); */
+      ASMSQRT(rarg,r);
+      r += x;
+      GET_BITS_DP64(r, ux);
+      log_kernel_amd64(r, ux, &xexp, &r1, &r2);
+      r1 = (xexp * log2_lead + r1);
+      r2 = (xexp * log2_tail + r2);
+      return r1 + r2;
+    }
+  else
+    {
+      /* 1.0 < x < 128.0 */
+      double u1, u2, v1, v2, w1, w2, hx, tx, t, r, s, p1, p2, a1, a2, c1, c2,
+        poly;
+      if (ux >= 0x3ff8000000000000)
+        {
+          /* 1.5 <= x < 128.0 */
+          /* We use minimax polynomials,
+             based on Abramowitz and Stegun 4.6.32 series
+             expansion for acosh(x), with the log(2x) and 1/(2*2*x^2)
+             terms removed. We compensate for these two terms later.
+          */
+          t = x*x;
+          if (ux >= 0x4040000000000000)
+            {
+              /* [3,2] for 32.0 <= x < 128.0 */
+              poly =
+                (0.45995704464157438175e-9 +
+                 (-0.89080839823528631030e-9 +
+                  (-0.10370522395596168095e-27 +
+                   0.35255386405811106347e-32 * t) * t) * t) /
+                (0.21941191335882074014e-8 +
+                 (-0.10185073058358334569e-7 +
+                  0.95019562478430648685e-8 * t) * t);
+            }
+          else if (ux >= 0x4020000000000000)
+            {
+              /* [3,3] for 8.0 <= x < 32.0 */
+              poly =
+                (-0.54903656589072526589e-10 +
+                 (0.27646792387218569776e-9 +
+                  (-0.26912957240626571979e-9 -
+                   0.86712268396736384286e-29 * t) * t) * t) /
+                (-0.24327683788655520643e-9 +
+                 (0.20633757212593175571e-8 +
+                  (-0.45438330985257552677e-8 +
+                   0.28707154390001678580e-8 * t) * t) * t);
+            }
+          else if (ux >= 0x4010000000000000)
+            {
+              /* [4,3] for 4.0 <= x < 8.0 */
+              poly =
+                (-0.20827370596738166108e-6 +
+                 (0.10232136919220422622e-5 +
+                  (-0.98094503424623656701e-6 +
+                   (-0.11615338819596146799e-18 +
+                    0.44511847799282297160e-21 * t) * t) * t) * t) /
+                (-0.92579451630913718588e-6 +
+                 (0.76997374707496606639e-5 +
+                  (-0.16727286999128481170e-4 +
+                   0.10463413698762590251e-4 * t) * t) * t);
+            }
+          else if (ux >= 0x4000000000000000)
+            {
+              /* [5,5] for 2.0 <= x < 4.0 */
+              poly =
+                (-0.122195030526902362060e-7 +
+                 (0.157894522814328933143e-6 +
+                  (-0.579951798420930466109e-6 +
+                   (0.803568881125803647331e-6 +
+                    (-0.373906657221148667374e-6 -
+                     0.317856399083678204443e-21 * t) * t) * t) * t) * t) /
+                (-0.516260096352477148831e-7 +
+                 (0.894662592315345689981e-6 +
+                  (-0.475662774453078218581e-5 +
+                   (0.107249291567405130310e-4 +
+                    (-0.107871445525891289759e-4 +
+                     0.398833767702587224253e-5 * t) * t) * t) * t) * t);
+            }
+          else if (ux >= 0x3ffc000000000000)
+            {
+              /* [5,4] for 1.75 <= x < 2.0 */
+              poly =
+                (0.1437926821253825186e-3 +
+                 (-0.1034078230246627213e-2 +
+                  (0.2015310005461823437e-2 +
+                   (-0.1159685218876828075e-2 +
+                    (-0.9267353551307245327e-11 +
+                     0.2880267770324388034e-12 * t) * t) * t) * t) * t) /
+                (0.6305521447028109891e-3 +
+                 (-0.6816525887775002944e-2 +
+                  (0.2228081831550003651e-1 +
+                   (-0.2836886105406603318e-1 +
+                    0.1236997707206036752e-1 * t) * t) * t) * t);
+            }
+          else
+            {
+              /* [5,4] for 1.5 <= x < 1.75 */
+              poly =
+                ( 0.7471936607751750826e-3 +
+                  (-0.4849405284371905506e-2 +
+                   (0.8823068059778393019e-2 +
+                    (-0.4825395461288629075e-2 +
+                     (-0.1001984320956564344e-8 +
+                      0.4299919281586749374e-10 * t) * t) * t) * t) * t) /
+                (0.3322359141239411478e-2 +
+                 (-0.3293525930397077675e-1 +
+                  (0.1011351440424239210e0 +
+                   (-0.1227083591622587079e0 +
+                    0.5147099404383426080e-1 * t) * t) * t) * t);
+            }
+          GET_BITS_DP64(x, ux);
+          log_kernel_amd64(x, ux, &xexp, &r1, &r2);
+          r1 = ((xexp+1) * log2_lead + r1);
+          r2 = ((xexp+1) * log2_tail + r2);
+          /* Now (r1,r2) sum to log(2x). Subtract the term
+             1/(2*2*x^2) = 0.25/t, and add poly/t, carefully
+             to maintain precision. (Note that we add poly/t
+             rather than poly because of the *x factor used
+             when generating the minimax polynomial) */
+          v2 = (poly-0.25)/t;
+          r = v2 + r1;
+          s = ((r1 - r) + v2) + r2;
+          v1 = r + s;
+          return v1 + ((r - v1) + s);
+        }
+
+      /* Here 1.0 < x < 1.5. It is hard to maintain accuracy here so
+         we have to go to great lengths to do so. */
+
+      /* We compute the value
+           t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0))
+         using simulated quad precision. */
+      t = x - 1.0;
+      u1 = t * 2.0;
+
+      /* dekker_mul12(t,t,&v1,&v2); hx is t with its low 27 bits cleared */
+      GET_BITS_DP64(t, ux);
+      ux &= 0xfffffffff8000000;
+      PUT_BITS_DP64(ux, hx);
+      tx = t - hx;
+      v1 = t * t;
+      v2 = (((hx * hx - v1) + hx * tx) + tx * hx) + tx * tx;
+
+      /* dekker_add2(u1,0.0,v1,v2,&w1,&w2); */
+      r = u1 + v1;
+      s = (((u1 - r) + v1) + v2);
+      w1 = r + s;
+      w2 = (r - w1) + s;
+
+      /* dekker_sqrt2(w1,w2,&u1,&u2); */
+      ASMSQRT(w1,p1);
+      GET_BITS_DP64(p1, ux);
+      ux &= 0xfffffffff8000000;
+      PUT_BITS_DP64(ux, c1);
+      c2 = p1 - c1;
+      a1 = p1 * p1;
+      a2 = (((c1 * c1 - a1) + c1 * c2) + c2 * c1) + c2 * c2;
+      p2 = (((w1 - a1) - a2) + w2) * 0.5 / p1;
+      u1 = p1 + p2;
+      u2 = (p1 - u1) + p2;
+
+      /* dekker_add2(u1,u2,t,0.0,&r1,&r2); */
+      r = u1 + t;
+      s = (((u1 - r) + t)) + u2;
+      r1 = r + s;
+      r2 = (r - r1) + s;
+      t = r1 + r2;
+
+      /* Check for x close to 1.0. */
+      if (x < 1.13)
+        {
+          /* Here 1.0 < x < 1.13 implies t <= 0.656. In this region
+             we need to take extra care to maintain precision.
+             We have t = r1 + r2 = (x - 1.0 + sqrt(x*x-1.0))
+             to more than basic precision. We use the Taylor series
+             for log(1+t), with terms after the O(t*t) term
+             approximated by a [6,6] minimax polynomial. */
+          double b1, b2, c1, c2, e1, e2, q1, q2, c, cc, hr1, tr1, hpoly, tpoly, hq1, tq1, hr2, tr2;
+          poly =
+            (0.30893760556597282162e-21 +
+             (0.10513858797132174471e0 +
+              (0.27834538302122012381e0 +
+               (0.27223638654807468186e0 +
+                (0.12038958198848174570e0 +
+                 (0.23357202004546870613e-1 +
+                  (0.15208417992520237648e-2 +
+                   0.72741030690878441996e-7 * t) * t) * t) * t) * t) * t) * t) /
+            (0.31541576391396523486e0 +
+             (0.10715979719991342022e1 +
+              (0.14311581802952004012e1 +
+               (0.94928647994421895988e0 +
+                (0.32396235926176348977e0 +
+                 (0.52566134756985833588e-1 +
+                  0.30477895574211444963e-2 * t) * t) * t) * t) * t) * t);
+
+          /* Now we can compute the result r = acosh(x) = log1p(t)
+             using the formula t - 0.5*t*t + poly*t*t. Since t is
+             represented as r1+r2, the formula becomes
+             r = r1+r2 - 0.5*(r1+r2)*(r1+r2) + poly*(r1+r2)*(r1+r2).
+             Expanding out, we get
+               r = r1 + r2 - (0.5 - poly)*(r1*r1 + 2*r1*r2 + r2*r2)
+             and ignoring negligible quantities we get
+               r = r1 + r2 - 0.5*r1*r1 - r1*r2 + poly*t*t
+          */
+          if (x < 1.06)
+            {
+              double b, c, e;
+              b = r1*r2;
+              c = 0.5*r1*r1;
+              e = poly*t*t;
+              /* N.B. the order of additions and subtractions is important */
+              r = (((r2 - b) + e) - c) + r1;
+              return r;
+            }
+          else
+            {
+              /* For 1.06 <= x < 1.13 we must evaluate in extended precision
+                 to reach about 1 ulp accuracy (in this range the simple code
+                 above only manages about 1.5 ulp accuracy) */
+
+              /* Split poly, r1 and r2 into head and tail sections */
+              GET_BITS_DP64(poly, ux);
+              ux &= 0xfffffffff8000000;
+              PUT_BITS_DP64(ux,hpoly);
+              tpoly = poly - hpoly;
+              GET_BITS_DP64(r1,ux);
+              ux &= 0xfffffffff8000000;
+              PUT_BITS_DP64(ux,hr1);
+              tr1 = r1 - hr1;
+              GET_BITS_DP64(r2, ux);
+              ux &= 0xfffffffff8000000;
+              PUT_BITS_DP64(ux,hr2);
+              tr2 = r2 - hr2;
+
+              /* e = poly*t*t, built as (q1,q2) = poly*t and then (e1,e2) = q*t */
+              c = poly * r1;
+              cc = (((hpoly * hr1 - c) + hpoly * tr1) + tpoly * hr1) + tpoly * tr1;
+              cc = poly * r2 + cc;
+              q1 = c + cc;
+              q2 = (c - q1) + cc;
+              GET_BITS_DP64(q1, ux);
+              ux &= 0xfffffffff8000000;
+              PUT_BITS_DP64(ux,hq1);
+              tq1 = q1 - hq1;
+              c = q1 * r1;
+              cc = (((hq1 * hr1 - c) + hq1 * tr1) + tq1 * hr1) + tq1 * tr1;
+              cc = q1 * r2 + q2 * r1 + cc;
+              e1 = c + cc;
+              e2 = (c - e1) + cc;
+
+              /* b = r1*r2, as the pair (b1,b2) */
+              b1 = r1 * r2;
+              b2 = (((hr1 * hr2 - b1) + hr1 * tr2) + tr1 * hr2) + tr1 * tr2;
+
+              /* c = 0.5*r1*r1, as the pair (c1,c2) */
+              c1 = (0.5*r1) * r1;
+              c2 = (((0.5*hr1 * hr1 - c1) + 0.5*hr1 * tr1) + 0.5*tr1 * hr1) + 0.5*tr1 * tr1;
+
+              /* v = r1 + r2 - b */
+              r = r1 - b1;
+              s = (((r1 - r) - b1) - b2) + r2;
+              v1 = r + s;
+              v2 = (r - v1) + s;
+
+              /* w = (r1 + r2 - b) - c */
+              r = v1 - c1;
+              s = (((v1 - r) - c1) - c2) + v2;
+              w1 = r + s;
+              w2 = (r - w1) + s;
+
+              /* u = ((r1 + r2 - b) - c) + e */
+              r = w1 + e1;
+              s = (((w1 - r) + e1) + e2) + w2;
+              u1 = r + s;
+              u2 = (r - u1) + s;
+
+              /* The result r = acosh(x) */
+              r = u1 + u2;
+
+              return r;
+            }
+        }
+      else
+        {
+          /* For arguments 1.13 <= x < 1.5 the log1p function
+             is good enough: acosh(x) = log1p(t) */
+            return FN_PROTOTYPE(log1p)(t);
+        }
+    }
+}
+
+weak_alias (__acosh, acosh)
diff --git a/src/acoshf.c b/src/acoshf.c
new file mode 100644
index 0000000..c96fdb0
--- /dev/null
+++ b/src/acoshf.c
@@ -0,0 +1,149 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#include <stdio.h>
+
+#define USE_NANF_WITH_FLAGS
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NANF_WITH_FLAGS
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error exit for acoshf: fill a matherr record, report, and set errno. */
+static inline float retval_errno_edom(float x)
+{
+  struct exception err;
+  err.name = (char *)"acoshf";
+  err.type = DOMAIN;
+  err.arg1 = (double)x;
+  err.arg2 = (double)x;
+  /* SVID mode returns -HUGE; otherwise the value built by nanf_with_flags(). */
+  err.retval = (_LIB_VERSION == _SVID_) ? -HUGE : nanf_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, no message is printed and errno is left alone. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("acoshf: DOMAIN error\n", stderr);
+    }
+  /* Reached in POSIX mode, or when matherr() returned zero. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#undef _FUNCNAME
+#define _FUNCNAME "acoshf"
+float FN_PROTOTYPE(acoshf)(float x)
+{
+  /* Computes acosh(x) for float x; the finite in-range cases are evaluated in double. */
+  unsigned int ux;
+  double dx, r, rarg, t;
+
+  static const unsigned int
+    recrteps = 0x46000000; /* large-x cutoff: bits of 8192.0F (2^13). NOTE(review): 4096.0F would be 0x45800000 */
+
+  static const double
+    log2 = 6.93147180559945286227e-01;  /* 0x3fe62e42fefa39ef */
+
+  GET_BITS_SP32(x, ux);
+
+  if ((ux & EXPBITS_SP32) == EXPBITS_SP32)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_SP32)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, ux|0x00400000, _DOMAIN,
+                               0, EDOM, x, 0.0F);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity */
+          if (ux & SIGNBIT_SP32)
+            /* x is negative infinity. Return a NaN. */
+#ifdef WINDOWS
+            return handle_errorf(_FUNCNAME, INDEFBITPATT_SP32, _DOMAIN,
+                                 AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+            return retval_errno_edom(x);
+#endif
+          else
+            /* Return positive infinity with no signal */
+            return x;
+        }
+    }
+  else if ((ux & SIGNBIT_SP32) || (ux < 0x3f800000))
+    {
+      /* x is less than 1.0 (0x3f800000 is the bit pattern of 1.0F). Return a NaN. */
+#ifdef WINDOWS
+      return handle_errorf(_FUNCNAME, INDEFBITPATT_SP32, _DOMAIN,
+                           AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+      return retval_errno_edom(x);
+#endif
+    }
+
+  dx = x;
+
+  if (ux > recrteps)
+    {
+      /* Arguments greater than the recrteps cutoff are
+         approximated by acoshf(x) = ln(2) + ln(x) */
+      r = FN_PROTOTYPE(log)(dx) + log2;
+    }
+  else if (ux > 0x40000000)
+    {
+      /* 2.0 < x <= recrteps cutoff */
+      /* acoshf for these arguments is approximated by
+         acoshf(x) = ln(x + sqrt(x*x-1)) */
+      rarg = dx*dx-1.0;
+      /* Use assembly instruction to compute r = sqrt(rarg); */
+      ASMSQRT(rarg,r);
+      rarg = r + dx;
+      r = FN_PROTOTYPE(log)(rarg);
+    }
+  else
+    {
+      /* 1.0 <= x <= 2.0: acoshf(x) = log1p(t + sqrt(2*t + t*t)), t = x - 1 */
+      t = dx - 1.0;
+      rarg = 2.0*t + t*t;
+      ASMSQRT(rarg,r);  /* r = sqrt(rarg) */
+      rarg = t + r;
+      r = FN_PROTOTYPE(log1p)(rarg);
+    }
+  return (float)(r);
+}
+
+weak_alias (__acoshf, acoshf)
diff --git a/src/asin.c b/src/asin.c
new file mode 100644
index 0000000..0314dd8
--- /dev/null
+++ b/src/asin.c
@@ -0,0 +1,196 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VAL_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NAN_WITH_FLAGS
+#undef USE_VAL_WITH_FLAGS
+#undef USE_HANDLE_ERROR
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error path for asin: fills in a struct exception and sets errno */
+static inline double retval_errno_edom(double x)
+{
+  struct exception err;
+  err.name = (char *)"asin";
+  err.type = DOMAIN;
+  err.arg1 = x;
+  err.arg2 = x;
+  /* HUGE in SVID mode, otherwise a NaN from nan_with_flags(AMD_F_INVALID) */
+  err.retval = (_LIB_VERSION == _SVID_) ? HUGE : nan_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION == _POSIX_)
+    {
+      __set_errno(EDOM); /* POSIX mode: errno only, matherr() is not called */
+      return err.retval;
+    }
+  if (matherr(&err))
+    return err.retval; /* nonzero from matherr(): leave errno untouched */
+  if (_LIB_VERSION == _SVID_)
+    (void)fputs("asin: DOMAIN error\n", stderr);
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(asin)
+#endif
+
+double FN_PROTOTYPE(asin)(double x)
+{
+  /* Computes arcsin(x) for a double-precision argument.
+     The argument is first reduced by noting that arcsin(x)
+     is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+     For denormal and small arguments (abs(x) < 2^-28) arcsin(x) = x
+     to machine accuracy. Remaining argument ranges are handled as follows.
+     For abs(x) < 0.5 use
+     arcsin(x) = x + x^3*R(x^2)
+     where R(x^2) is a rational minimax approximation to
+     (arcsin(x) - x)/x^3.
+     For abs(x) >= 0.5 exploit the identity:
+      arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+     together with the above rational approximation, and
+     reconstruct the terms carefully.
+    */
+
+  /* Constants: piby2 ~ pi/2; hpiby2_head ~ pi/4 and piby2_tail are used in the reconstruction. */
+
+  static const double
+    piby2_tail  = 6.1232339957367660e-17, /* 0x3c91a62633145c07 */
+    hpiby2_head = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
+    piby2       = 1.5707963267948965e+00; /* 0x3ff921fb54442d18 */
+  double u, v, y, s=0.0, r;
+  int xexp, xnan, transform=0;
+
+  unsigned long long ux, aux, xneg;
+  GET_BITS_DP64(x, ux);
+  aux = ux & ~SIGNBIT_DP64; /* bit pattern with the sign bit cleared */
+  xneg = (ux & SIGNBIT_DP64); /* nonzero iff the sign bit of x is set */
+  xnan = (aux > PINFBITPATT_DP64); /* above the +infinity pattern => NaN */
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+
+  /* Special cases: NaN, tiny arguments, and abs(x) >= 1 */
+
+  if (xnan)
+    {
+#ifdef WINDOWS
+      return handle_error("asin", ux|0x0008000000000000, _DOMAIN,
+                          0, EDOM, x, 0.0);
+#else
+      return x + x; /* With invalid if it's a signalling NaN */
+#endif
+    }
+  else if (xexp < -28)
+    { /* abs(x) < 2^-28: x small enough that arcsin(x) = x */
+      return val_with_flags(x, AMD_F_INEXACT);
+    }
+  else if (xexp >= 0)
+    { /* abs(x) >= 1.0: only +/-1.0 are in the domain */
+      if (x == 1.0)
+        return val_with_flags(piby2, AMD_F_INEXACT);
+      else if (x == -1.0)
+        return val_with_flags(-piby2, AMD_F_INEXACT);
+      else
+#ifdef WINDOWS
+        return handle_error("asin", INDEFBITPATT_DP64, _DOMAIN,
+                            AMD_F_INVALID, EDOM, x, 0.0);
+#else
+        return retval_errno_edom(x);
+#endif
+    }
+
+  if (xneg) y = -x; /* y = abs(x); the sign is restored on return */
+  else y = x;
+
+  transform = (xexp >= -1); /* abs(x) >= 0.5 */
+
+  if (transform)
+    { /* r = (1-y)/2 and y = sqrt(r): maps y from [0.5,1) into (0,0.5] */
+      r = 0.5*(1.0 - y);
+#ifdef WINDOWS
+      /* VC++ intrinsic call */
+      _mm_store_sd(&s, _mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&r)));
+#else
+      /* Hammer sqrt instruction */
+      asm volatile ("sqrtsd %1, %0" : "=x" (s) : "x" (r));
+#endif
+      y = s;
+    }
+  else
+    r = y*y;
+
+  /* u = r*R(r) with r ~= y*y: rational approximation for y in [0.0, 0.5] */
+
+  u = r*(0.227485835556935010735943483075 +
+         (-0.445017216867635649900123110649 +
+          (0.275558175256937652532686256258 +
+           (-0.0549989809235685841612020091328 +
+            (0.00109242697235074662306043804220 +
+             0.0000482901920344786991880522822991*r)*r)*r)*r)*r)/
+    (1.36491501334161032038194214209 +
+     (-3.28431505720958658909889444194 +
+      (2.76568859157270989520376345954 +
+       (-0.943639137032492685763471240072 +
+        0.105869422087204370341222318533*r)*r)*r)*r);
+
+  if (transform)
+    { /* Reconstruct v = pi/2 - 2*(s + s*u) carefully in transformed region */
+        {
+          double c, s1, p, q;
+          unsigned long long us;
+          GET_BITS_DP64(s, us);
+          PUT_BITS_DP64(0xffffffff00000000 & us, s1); /* s1 = s with low 32 bits cleared */
+          c = (r-s1*s1)/(s+s1); /* correction term: s1 + c ~= sqrt(r) */
+          p = 2.0*s*u - (piby2_tail-2.0*c);
+          q = hpiby2_head - 2.0*s1;
+          v = hpiby2_head - (p-q); /* = 2*hpiby2_head + piby2_tail - 2*(s1 + c + s*u) */
+        }
+    }
+  else
+    {
+#ifdef WINDOWS
+      /* Use a temporary variable to prevent VC++ rearranging
+            y + y*u
+         into
+            y * (1 + u)
+         and getting an incorrectly rounded result */
+      double tmp;
+      tmp = y * u;
+      v = y + tmp;
+#else
+      v = y + y*u;
+#endif
+    }
+
+  if (xneg) return -v; /* arcsin(-x) = -arcsin(x) */
+  else return v;
+}
+
+weak_alias (__asin, asin)
diff --git a/src/asinf.c b/src/asinf.c
new file mode 100644
index 0000000..4b42b01
--- /dev/null
+++ b/src/asinf.c
@@ -0,0 +1,190 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VALF_WITH_FLAGS
+#define USE_NANF_WITH_FLAGS
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NANF_WITH_FLAGS
+#undef USE_VALF_WITH_FLAGS
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error path for asinf: fills in a struct exception and sets errno */
+static inline float retval_errno_edom(float x)
+{
+  struct exception err;
+  err.name = (char *)"asinf";
+  err.type = DOMAIN;
+  err.arg1 = (double)x;
+  err.arg2 = (double)x;
+  /* HUGE in SVID mode, otherwise a NaN from nanf_with_flags(AMD_F_INVALID) */
+  err.retval = (_LIB_VERSION == _SVID_) ? HUGE : nanf_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION == _POSIX_)
+    {
+      __set_errno(EDOM); /* POSIX mode: errno only, matherr() is not called */
+      return err.retval;
+    }
+  if (matherr(&err))
+    return err.retval; /* nonzero from matherr(): leave errno untouched */
+  if (_LIB_VERSION == _SVID_)
+    (void)fputs("asinf: DOMAIN error\n", stderr);
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(asinf)
+#endif
+
+float FN_PROTOTYPE(asinf)(float x)
+{
+  /* Computes arcsin(x) for a single-precision argument.
+     The argument is first reduced by noting that arcsin(x)
+     is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+     For denormal and small arguments (abs(x) < 2^-14) arcsin(x) = x
+     to machine accuracy. Remaining argument ranges are handled as follows.
+     For abs(x) < 0.5 use
+     arcsin(x) = x + x^3*R(x^2)
+     where R(x^2) is a rational minimax approximation to
+     (arcsin(x) - x)/x^3.
+     For abs(x) >= 0.5 exploit the identity:
+      arcsin(x) = pi/2 - 2*arcsin(sqrt((1-x)/2))
+     together with the above rational approximation, and
+     reconstruct the terms carefully.
+    */
+
+  /* Constants: piby2 ~ pi/2; hpiby2_head ~ pi/4 and piby2_tail are used in the reconstruction. */
+
+  static const float
+    piby2_tail  = 7.5497894159e-08F, /* 0x33a22168 */
+    hpiby2_head = 7.8539812565e-01F, /* 0x3f490fda */
+    piby2       = 1.5707963705e+00F; /* 0x3fc90fdb */
+  float u, v, y, s = 0.0F, r;
+  int xexp, xnan, transform = 0;
+
+  unsigned int ux, aux, xneg;
+  GET_BITS_SP32(x, ux);
+  aux = ux & ~SIGNBIT_SP32; /* bit pattern with the sign bit cleared */
+  xneg = (ux & SIGNBIT_SP32); /* nonzero iff the sign bit of x is set */
+  xnan = (aux > PINFBITPATT_SP32); /* above the +infinity pattern => NaN */
+  xexp = (int)((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+
+  /* Special cases: NaN, tiny arguments, and abs(x) >= 1 */
+
+  if (xnan)
+    {
+#ifdef WINDOWS
+      return handle_errorf("asinf", ux|0x00400000, _DOMAIN, 0,
+                           EDOM, x, 0.0F);
+#else
+      return x + x; /* With invalid if it's a signalling NaN */
+#endif
+    }
+  else if (xexp < -14)
+    /* abs(x) < 2^-14: x small enough that arcsin(x) = x */
+    return valf_with_flags(x, AMD_F_INEXACT);
+  else if (xexp >= 0)
+    {
+      /* abs(x) >= 1.0: only +/-1.0 are in the domain */
+      if (x == 1.0F)
+        return valf_with_flags(piby2, AMD_F_INEXACT);
+      else if (x == -1.0F)
+        return valf_with_flags(-piby2, AMD_F_INEXACT);
+      else
+#ifdef WINDOWS
+        return handle_errorf("asinf", INDEFBITPATT_SP32, _DOMAIN,
+                             AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+        return retval_errno_edom(x);
+#endif
+    }
+
+  if (xneg) y = -x; /* y = abs(x); the sign is restored on return */
+  else y = x;
+
+  transform = (xexp >= -1); /* abs(x) >= 0.5 */
+
+  if (transform)
+    { /* r = (1-y)/2 and y = sqrt(r): maps y from [0.5,1) into (0,0.5] */
+      r = 0.5F*(1.0F - y);
+#ifdef WINDOWS
+      /* VC++ intrinsic call */
+      _mm_store_ss(&s, _mm_sqrt_ss(_mm_load_ss(&r)));
+#else
+      /* Hammer sqrt instruction */
+      asm volatile ("sqrtss %1, %0" : "=x" (s) : "x" (r));
+#endif
+      y = s;
+    }
+  else
+    r = y*y;
+
+  /* u = r*R(r) with r ~= y*y: rational approximation for y in [0.0, 0.5] */
+
+  u=r*(0.184161606965100694821398249421F +
+       (-0.0565298683201845211985026327361F +
+	(-0.0133819288943925804214011424456F -
+	 0.00396137437848476485201154797087F*r)*r)*r)/
+    (1.10496961524520294485512696706F -
+     0.836411276854206731913362287293F*r);
+
+  if (transform)
+    {
+      /* Reconstruct v = pi/2 - 2*(s + s*u) carefully in transformed region */
+      float c, s1, p, q;
+      unsigned int us;
+      GET_BITS_SP32(s, us);
+      PUT_BITS_SP32(0xffff0000 & us, s1); /* s1 = s with low 16 bits cleared */
+      c = (r-s1*s1)/(s+s1); /* correction term: s1 + c ~= sqrt(r) */
+      p = 2.0F*s*u - (piby2_tail-2.0F*c);
+      q = hpiby2_head - 2.0F*s1;
+      v = hpiby2_head - (p-q); /* = 2*hpiby2_head + piby2_tail - 2*(s1 + c + s*u) */
+    }
+  else
+    {
+#ifdef WINDOWS
+      /* Use a temporary variable to prevent VC++ rearranging
+            y + y*u
+         into
+            y * (1 + u)
+         and getting an incorrectly rounded result */
+      float tmp;
+      tmp = y * u;
+      v = y + tmp;
+#else
+      v = y + y*u;
+#endif
+    }
+
+  if (xneg) return -v; /* arcsin(-x) = -arcsin(x) */
+  else return v;
+}
+
+weak_alias (__asinf, asinf)
diff --git a/src/asinh.c b/src/asinh.c
new file mode 100644
index 0000000..7ecde9c
--- /dev/null
+++ b/src/asinh.c
@@ -0,0 +1,322 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_HANDLE_ERROR
+#define USE_LOG_KERNEL_AMD
+#define USE_VAL_WITH_FLAGS
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERROR
+#undef USE_LOG_KERNEL_AMD
+#undef USE_VAL_WITH_FLAGS /* was VAL_WITH_FLAGS, which left USE_VAL_WITH_FLAGS defined */
+
+#undef _FUNCNAME
+#define _FUNCNAME "asinh"
+double FN_PROTOTYPE(asinh)(double x)
+{
+  /* Computes the inverse hyperbolic sine of x, choosing a method by the size of abs(x). */
+  unsigned long long ux, ax, xneg;
+  double absx, r, rarg, t, r1, r2, poly, s, v1, v2;
+  int xexp;
+
+  static const unsigned long long
+    rteps = 0x3e46a09e667f3bcd,    /* sqrt(eps) = 1.05367121277235086670e-08 */
+    recrteps = 0x4196a09e667f3bcd; /* 1/rteps = 9.49062656242515593767e+07 */
+
+  /* log2_lead and log2_tail sum to an extra-precise version
+     of log(2) */
+  static const double
+    log2_lead = 6.93147122859954833984e-01,  /* 0x3fe62e42e0000000 */
+    log2_tail = 5.76999904754328540596e-08;  /* 0x3e6efa39ef35793c */
+
+
+  GET_BITS_DP64(x, ux);
+  ax = ux & ~SIGNBIT_DP64; /* bit pattern of abs(x) */
+  xneg = ux & SIGNBIT_DP64; /* nonzero iff the sign bit of x is set */
+  PUT_BITS_DP64(ax, absx); /* absx = abs(x) as a double */
+
+  if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_DP64)
+        {
+          /* x is NaN. NOTE(review): passes AMD_F_INVALID here; asin/atan/asinhf pass 0 - confirm */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, ux|0x0008000000000000, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity. Return the same infinity. */
+#ifdef WINDOWS
+          if (ux & SIGNBIT_DP64)
+            return handle_error(_FUNCNAME, NINFBITPATT_DP64, _DOMAIN,
+                                AMD_F_INVALID, EDOM, x, 0.0);
+          else
+            return handle_error(_FUNCNAME, PINFBITPATT_DP64, _DOMAIN,
+                                AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return x;
+#endif
+        }
+    }
+  else if (ax < rteps) /* abs(x) < sqrt(epsilon) */
+    {
+      if (ax == 0x0000000000000000)
+        {
+          /* x is +/-zero. Return the same zero. */
+          return x;
+        }
+      else
+        {
+          /* Tiny arguments approximated by asinh(x) = x
+             - avoid slow operations on denormalized numbers */
+          return val_with_flags(x,AMD_F_INEXACT);
+        }
+    }
+
+
+  if (ax <= 0x3ff0000000000000) /* abs(x) <= 1.0 */
+    {
+      /* Arguments up to 1.0 in magnitude are
+         approximated by [4,4] or [5,4] minimax polynomials
+         fitted to asinh series 4.6.31 (x < 1) from Abramowitz and Stegun
+      */
+      t = x*x;
+      if (ax < 0x3fd0000000000000)
+        {
+          /* [4,4] for 0 < abs(x) < 0.25 */
+          poly =
+            (-0.12845379283524906084997e0 +
+             (-0.21060688498409799700819e0 +
+              (-0.10188951822578188309186e0 +
+               (-0.13891765817243625541799e-1 -
+                0.10324604871728082428024e-3 * t) * t) * t) * t) /
+            (0.77072275701149440164511e0 +
+             (0.16104665505597338100747e1 +
+              (0.11296034614816689554875e1 +
+               (0.30079351943799465092429e0 +
+                0.235224464765951442265117e-1 * t) * t) * t) * t);
+        }
+      else if (ax < 0x3fe0000000000000)
+        {
+          /* [4,4] for 0.25 <= abs(x) < 0.5 */
+          poly =
+            (-0.12186605129448852495563e0 +
+             (-0.19777978436593069928318e0 +
+              (-0.94379072395062374824320e-1 +
+               (-0.12620141363821680162036e-1 -
+                0.903396794842691998748349e-4 * t) * t) * t) * t) /
+            (0.73119630776696495279434e0 +
+             (0.15157170446881616648338e1 +
+              (0.10524909506981282725413e1 +
+               (0.27663713103600182193817e0 +
+                0.21263492900663656707646e-1 * t) * t) * t) * t);
+        }
+      else if (ax < 0x3fe8000000000000)
+        {
+          /* [4,4] for 0.5 <= abs(x) < 0.75 */
+          poly =
+            (-0.81210026327726247622500e-1 +
+             (-0.12327355080668808750232e0 +
+              (-0.53704925162784720405664e-1 +
+               (-0.63106739048128554465450e-2 -
+                0.35326896180771371053534e-4 * t) * t) * t) * t) /
+            (0.48726015805581794231182e0 +
+             (0.95890837357081041150936e0 +
+              (0.62322223426940387752480e0 +
+               (0.15028684818508081155141e0 +
+                0.10302171620320141529445e-1 * t) * t) * t) * t);
+        }
+      else
+        {
+          /* [5,4] for 0.75 <= abs(x) <= 1.0 */
+          poly =
+            (-0.4638179204422665073e-1 +
+             (-0.7162729496035415183e-1 +
+              (-0.3247795155696775148e-1 +
+               (-0.4225785421291932164e-2 +
+                (-0.3808984717603160127e-4 +
+                 0.8023464184964125826e-6 * t) * t) * t) * t) * t) /
+            (0.2782907534642231184e0 +
+             (0.5549945896829343308e0 +
+              (0.3700732511330698879e0 +
+               (0.9395783438240780722e-1 +
+                0.7200057974217143034e-2 * t) * t) * t) * t);
+        }
+      return x + x*t*poly; /* sign carried by x, so no xneg fix-up is needed */
+    }
+  else if (ax < 0x4040000000000000)
+    {
+      /* 1.0 < abs(x) < 32.0 */
+      /* Arguments in this region are approximated by various
+         minimax polynomials fitted to asinh series 4.6.31
+         in Abramowitz and Stegun.
+      */
+      t = x*x;
+      if (ax >= 0x4020000000000000)
+        {
+          /* [3,3] for 8.0 <= abs(x) < 32.0 */
+          poly =
+            (-0.538003743384069117e-10 +
+             (-0.273698654196756169e-9 +
+              (-0.268129826956403568e-9 -
+               0.804163374628432850e-29 * t) * t) * t) /
+            (0.238083376363471960e-9 +
+             (0.203579344621125934e-8 +
+              (0.450836980450693209e-8 +
+               0.286005148753497156e-8 * t) * t) * t);
+        }
+      else if (ax >= 0x4010000000000000)
+        {
+          /* [4,3] for 4.0 <= abs(x) < 8.0 */
+          poly =
+            (-0.178284193496441400e-6 +
+             (-0.928734186616614974e-6 +
+              (-0.923318925566302615e-6 +
+               (-0.776417026702577552e-19 +
+                0.290845644810826014e-21 * t) * t) * t) * t) /
+            (0.786694697277890964e-6 +
+             (0.685435665630965488e-5 +
+              (0.153780175436788329e-4 +
+               0.984873520613417917e-5 * t) * t) * t);
+
+        }
+      else if (ax >= 0x4000000000000000)
+        {
+          /* [5,4] for 2.0 <= abs(x) < 4.0 */
+          poly =
+            (-0.209689451648100728e-6 +
+             (-0.219252358028695992e-5 +
+              (-0.551641756327550939e-5 +
+               (-0.382300259826830258e-5 +
+                (-0.421182121910667329e-17 +
+                 0.492236019998237684e-19 * t) * t) * t) * t) * t) /
+            (0.889178444424237735e-6 +
+             (0.131152171690011152e-4 +
+              (0.537955850185616847e-4 +
+               (0.814966175170941864e-4 +
+                0.407786943832260752e-4 * t) * t) * t) * t);
+        }
+      else if (ax >= 0x3ff8000000000000)
+        {
+          /* [5,4] for 1.5 <= abs(x) < 2.0 */
+          poly =
+            (-0.195436610112717345e-4 +
+             (-0.233315515113382977e-3 +
+              (-0.645380957611087587e-3 +
+               (-0.478948863920281252e-3 +
+                (-0.805234112224091742e-12 +
+                 0.246428598194879283e-13 * t) * t) * t) * t) * t) /
+            (0.822166621698664729e-4 +
+             (0.135346265620413852e-2 +
+              (0.602739242861830658e-2 +
+               (0.972227795510722956e-2 +
+                0.510878800983771167e-2 * t) * t) * t) * t);
+        }
+      else
+        {
+          /* [5,5] for 1.0 < abs(x) < 1.5 */
+          poly =
+            (-0.121224194072430701e-4 +
+             (-0.273145455834305218e-3 +
+              (-0.152866982560895737e-2 +
+               (-0.292231744584913045e-2 +
+                (-0.174670900236060220e-2 -
+                 0.891754209521081538e-12 * t) * t) * t) * t) * t) /
+            (0.499426632161317606e-4 +
+             (0.139591210395547054e-2 +
+              (0.107665231109108629e-1 +
+               (0.325809818749873406e-1 +
+                (0.415222526655158363e-1 +
+                 0.186315628774716763e-1 * t) * t) * t) * t) * t);
+        }
+      log_kernel_amd64(absx, ax, &xexp, &r1, &r2);
+      r1 = ((xexp+1) * log2_lead + r1);
+      r2 = ((xexp+1) * log2_tail + r2);
+      /* Now (r1,r2) sum to log(2x). Add the term
+         1/(2*2*x^2) = 0.25/t, and add poly/t, carefully
+         to maintain precision. (Note that we add poly/t
+         rather than poly because of the *x factor used
+         when generating the minimax polynomial) */
+      v2 = (poly+0.25)/t;
+      r = v2 + r1;
+      s = ((r1 - r) + v2) + r2; /* rounding error of v2 + r1, plus the tail r2 */
+      v1 = r + s;
+      v2 = (r - v1) + s;
+      r = v1 + v2;
+      if (xneg)
+        return -r;
+      else
+        return r;
+    }
+  else
+    {
+      /* abs(x) >= 32.0 */
+      if (ax > recrteps)
+        {
+          /* Arguments greater than 1/sqrt(epsilon) in magnitude are
+             approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x */
+          /* log_kernel_amd64(x) returns xexp, r1, r2 such that
+             log(x) = xexp*log(2) + r1 + r2 */
+          log_kernel_amd64(absx, ax, &xexp, &r1, &r2);
+          /* Add (xexp+1) * log(2) to r1,r2 to get the result asinh(x).
+             The computed r1 is not subject to rounding error because
+             (xexp+1) has at most 10 significant bits, log2_lead has 24 significant
+             bits, and r1 has up to 24 bits; and the exponents of r1
+             and r2 differ by at most 6. */
+          r1 = ((xexp+1) * log2_lead + r1);
+          r2 = ((xexp+1) * log2_tail + r2);
+          if (xneg)
+            return -(r1 + r2);
+          else
+            return r1 + r2;
+        }
+      else
+        {
+          rarg = absx*absx+1.0;
+          /* Arguments such that 32.0 <= abs(x) <= 1/sqrt(epsilon) are
+             approximated by
+               asinh(x) = ln(abs(x) + sqrt(x*x+1))
+             with the sign of x (see Abramowitz and Stegun 4.6.20) */
+          /* Use assembly instruction to compute r = sqrt(rarg); */
+          ASMSQRT(rarg,r);
+          r += absx;
+          GET_BITS_DP64(r, ax); /* ax is reused for the bits of r, as log_kernel_amd64 takes both */
+          log_kernel_amd64(r, ax, &xexp, &r1, &r2);
+          r1 = (xexp * log2_lead + r1);
+          r2 = (xexp * log2_tail + r2);
+          if (xneg)
+            return -(r1 + r2);
+          else
+            return r1 + r2;
+        }
+    }
+}
+
+weak_alias (__asinh, asinh)
diff --git a/src/asinhf.c b/src/asinhf.c
new file mode 100644
index 0000000..f5d3bf9
--- /dev/null
+++ b/src/asinhf.c
@@ -0,0 +1,164 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#include <stdio.h>
+
+#define USE_HANDLE_ERRORF
+#define USE_VALF_WITH_FLAGS
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERRORF
+#undef USE_VALF_WITH_FLAGS /* was VALF_WITH_FLAGS, which left USE_VALF_WITH_FLAGS defined */
+
+#undef _FUNCNAME
+#define _FUNCNAME "asinhf"
+float FN_PROTOTYPE(asinhf)(float x)
+{
+  /* Computes the inverse hyperbolic sine of a float; the arithmetic below is done in double. */
+  double dx;
+  unsigned int ux, ax, xneg;
+  double absx, r, rarg, t, poly;
+
+  static const unsigned int
+    rteps = 0x39800000,    /* sqrt(eps) = 2.44140625000000000000e-04 */
+    recrteps = 0x46000000; /* bits of 8192.0F. NOTE(review): 1/rteps is 4096.0F (0x45800000) - confirm */
+
+  static const double
+    log2 = 6.93147180559945286227e-01;  /* 0x3fe62e42fefa39ef */
+
+  GET_BITS_SP32(x, ux);
+  ax = ux & ~SIGNBIT_SP32; /* bit pattern of abs(x) */
+  xneg = ux & SIGNBIT_SP32; /* nonzero iff the sign bit of x is set */
+
+  if ((ux & EXPBITS_SP32) == EXPBITS_SP32)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_SP32)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, ux|0x00400000, _DOMAIN,
+                               0, EDOM, x, 0.0F);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity. Return the same infinity. */
+#ifdef WINDOWS
+          if (ux & SIGNBIT_SP32)
+            return handle_errorf(_FUNCNAME, NINFBITPATT_SP32, _DOMAIN,
+                                 AMD_F_INVALID, EDOM, x, 0.0F);
+          else
+            return handle_errorf(_FUNCNAME, PINFBITPATT_SP32, _DOMAIN,
+                                 AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+          return x;
+#endif
+        }
+    }
+  else if (ax < rteps) /* abs(x) < sqrt(epsilon) */
+    {
+      if (ax == 0x00000000)
+        {
+          /* x is +/-zero. Return the same zero. */
+          return x;
+        }
+      else
+        {
+          /* Tiny arguments approximated by asinhf(x) = x
+             - avoid slow operations on denormalized numbers */
+          return valf_with_flags(x,AMD_F_INEXACT);
+        }
+    }
+
+  dx = x; /* widen to double for the remaining computation */
+  if (xneg)
+    absx = -dx;
+  else
+    absx = dx;
+
+  if (ax <= 0x40800000) /* abs(x) <= 4.0 */
+    {
+      /* Arguments up to 4.0 in magnitude are
+         approximated by [4,4] minimax polynomials
+      */
+      t = dx*dx;
+      if (ax <= 0x40000000) /* abs(x) <= 2 */
+        poly =
+          (-0.1152965835871758072e-1 +
+          (-0.1480204186473758321e-1 +
+          (-0.5063201055468483248e-2 +
+          (-0.4162727710583425360e-3 -
+            0.1177198915954942694e-5 * t) * t) * t) * t) /
+           (0.6917795026025976739e-1 +
+           (0.1199423176003939087e+0 +
+           (0.6582362487198468066e-1 +
+           (0.1260024978680227945e-1 +
+            0.6284381367285534560e-3 * t) * t) * t) * t);
+      else
+        poly =
+           (-0.185462290695578589e-2 +
+           (-0.113672533502734019e-2 +
+           (-0.142208387300570402e-3 +
+           (-0.339546014993079977e-5 -
+             0.151054665394480990e-8 * t) * t) * t) * t) /
+            (0.111486158580024771e-1 +
+            (0.117782437980439561e-1 +
+            (0.325903773532674833e-2 +
+            (0.255902049924065424e-3 +
+             0.434150786948890837e-5 * t) * t) * t) * t);
+      return (float)(dx + dx*t*poly); /* sign carried by dx, so no xneg fix-up is needed */
+    }
+  else
+    {
+      /* abs(x) > 4.0 */
+      if (ax > recrteps)
+        {
+          /* Arguments above the recrteps threshold in magnitude are
+             approximated by asinhf(x) = ln(2) + ln(abs(x)), with sign of x */
+          r = FN_PROTOTYPE(log)(absx) + log2;
+        }
+      else
+        {
+          rarg = absx*absx+1.0;
+          /* Arguments such that 4.0 < abs(x) <= the recrteps threshold are
+             approximated by
+               asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+             with the sign of x (see Abramowitz and Stegun 4.6.20) */
+          /* Use assembly instruction to compute r = sqrt(rarg); */
+          ASMSQRT(rarg,r);
+          r += absx;
+          r = FN_PROTOTYPE(log)(r);
+        }
+      if (xneg)
+        return (float)(-r);
+      else
+        return (float)r;
+    }
+}
+
+weak_alias (__asinhf, asinhf)
diff --git a/src/atan.c b/src/atan.c
new file mode 100644
index 0000000..3b99df9
--- /dev/null
+++ b/src/atan.c
@@ -0,0 +1,171 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VAL_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_VAL_WITH_FLAGS
+#undef USE_NAN_WITH_FLAGS
+#undef USE_HANDLE_ERROR
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Domain-error path for atan: fills in a struct exception and sets errno */
+static inline double retval_errno_edom(double x)
+{
+  struct exception err;
+  err.type = DOMAIN;
+  err.name = (char *)"atan";
+  err.arg1 = x;
+  err.arg2 = x;
+  /* HUGE in SVID mode, otherwise a NaN from nan_with_flags(AMD_F_INVALID) */
+  err.retval = (_LIB_VERSION == _SVID_) ? HUGE : nan_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION == _POSIX_)
+    {
+      __set_errno(EDOM); /* POSIX mode: errno only, matherr() is not called */
+      return err.retval;
+    }
+  if (matherr(&err))
+    return err.retval; /* nonzero from matherr(): leave errno untouched */
+  if (_LIB_VERSION == _SVID_)
+    (void)fputs("atan: DOMAIN error\n", stderr);
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(atan)
+#endif
+
+double FN_PROTOTYPE(atan)(double x)
+{
+  /* Computes arctan(x): reduce abs(x) to [-7/16,7/16], approximate, then restore the sign. */
+  /* Some constants and split constants. */
+
+  static const double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
+  double chi, clo, v, s, q, z;
+
+  /* Find properties of argument x. */
+
+  unsigned long long ux, aux, xneg;
+  GET_BITS_DP64(x, ux);
+  aux = ux & ~SIGNBIT_DP64; /* bit pattern of abs(x) */
+  xneg = (ux != aux); /* true iff the sign bit of x was set */
+
+  if (xneg) v = -x; /* v = abs(x) */
+  else v = x;
+
+  /* Argument reduction to range [-7/16,7/16]: x is overwritten with the reduced argument */
+
+  if (aux < 0x3e50000000000000) /* v < 2.0^(-26) */
+    {
+      /* x is a good approximation to atan(x) and avoids working on
+         intermediate denormal numbers */
+      if (aux == 0x0000000000000000)
+        return x;
+      else
+        return val_with_flags(x, AMD_F_INEXACT);
+    }
+  else if (aux > 0x4003800000000000) /* v > 39./16. */
+    {
+
+      if (aux > PINFBITPATT_DP64)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_error("atan", ux|0x0008000000000000, _DOMAIN, 0,
+                              EDOM, x, 0.0);
+#else
+          return x + x; /* Raise invalid if it's a signalling NaN */
+#endif
+        }
+      else if (aux > 0x4370000000000000)
+        { /* abs(x) > 2^56 => arctan(1/x) is
+             insignificant compared to piby2 */
+          if (xneg)
+            return val_with_flags(-piby2, AMD_F_INEXACT);
+          else
+            return val_with_flags(piby2, AMD_F_INEXACT);
+        }
+
+      x = -1.0/v; /* arctan(v) = arctan(infinity) + arctan(-1/v) */
+      /* (chi + clo) = arctan(infinity) */
+      chi = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */
+      clo = 6.12323399573676480327e-17; /* 0x3c91a62633145c06 */
+    }
+  else if (aux > 0x3ff3000000000000) /* 39./16. > v > 19./16. */
+    {
+      x = (v-1.5)/(1.0+1.5*v); /* arctan(v) = arctan(1.5) + arctan(x) */
+      /* (chi + clo) = arctan(1.5) */
+      chi = 9.82793723247329054082e-01; /* 0x3fef730bd281f69b */
+      clo = 1.39033110312309953701e-17; /* 0x3c7007887af0cbbc */
+    }
+  else if (aux > 0x3fe6000000000000) /* 19./16. > v > 11./16. */
+    {
+      x = (v-1.0)/(1.0+v); /* arctan(v) = arctan(1.) + arctan(x) */
+      /* (chi + clo) = arctan(1.) */
+      chi = 7.85398163397448278999e-01; /* 0x3fe921fb54442d18 */
+      clo = 3.06161699786838240164e-17; /* 0x3c81a62633145c06 */
+    }
+  else if (aux > 0x3fdc000000000000) /* 11./16. > v > 7./16. */
+    {
+      x = (2.0*v-1.0)/(2.0+v); /* arctan(v) = arctan(0.5) + arctan(x) */
+      /* (chi + clo) = arctan(0.5) */
+      chi = 4.63647609000806093515e-01; /* 0x3fddac670561bb4f */
+      clo = 2.26987774529616809294e-17; /* 0x3c7a2b7f222f65e0 */
+    }
+  else  /* v < 7./16. */
+    {
+      x = v; /* already in range: no offset is added */
+      chi = 0.0;
+      clo = 0.0;
+    }
+
+  /* Core approximation: Remez(4,4) on [-7/16,7/16]; q approximates x - arctan(x) */
+
+  s = x*x;
+  q = x*s*
+       (0.268297920532545909e0 +
+        (0.447677206805497472e0 +
+         (0.220638780716667420e0 +
+          (0.304455919504853031e-1 +
+            0.142316903342317766e-3*s)*s)*s)*s)/
+       (0.804893761597637733e0 +
+        (0.182596787737507063e1 +
+         (0.141254259931958921e1 +
+          (0.424602594203847109e0 +
+            0.389525873944742195e-1*s)*s)*s)*s);
+
+  z = chi - ((q - clo) - x); /* = chi + clo + (x - q), summing the small terms first */
+
+  if (xneg) z = -z; /* arctan(-x) = -arctan(x) */
+  return z;
+}
+
+weak_alias (__atan, atan)
diff --git a/src/atan2.c b/src/atan2.c
new file mode 100644
index 0000000..6531ee4
--- /dev/null
+++ b/src/atan2.c
@@ -0,0 +1,785 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VAL_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_SCALEDOUBLE_1
+#define USE_SCALEDOUBLE_2
+#define USE_SCALEUPDOUBLE1024
+#define USE_SCALEDOWNDOUBLE
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_VAL_WITH_FLAGS
+#undef USE_NAN_WITH_FLAGS
+#undef USE_SCALEDOUBLE_1
+#undef USE_SCALEDOUBLE_2
+#undef USE_SCALEUPDOUBLE1024
+#undef USE_SCALEDOWNDOUBLE
+#undef USE_HANDLE_ERROR
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range arguments
+   (only used when _LIB_VERSION is _SVID_) */
+static inline double retval_errno_edom(double x, double y)
+{
+  struct exception exc;
+  exc.arg1 = x;
+  exc.arg2 = y;
+  exc.name = (char *)"atan2";
+  exc.type = DOMAIN;
+  exc.retval = HUGE;
+  if (!matherr(&exc))
+    {
+      (void)fputs("atan2: DOMAIN error\n", stderr);
+      __set_errno(EDOM);
+    }
+  return exc.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(atan2)
+#endif
+
+double FN_PROTOTYPE(atan2)(double y, double x)
+{
+  /* Arrays atan_jby256_lead and atan_jby256_tail contain
+     leading and trailing parts respectively of precomputed
+     values of atan(j/256), for j = 16, 17, ..., 256.
+     atan_jby256_lead contains the first 21 bits of precision,
+     and atan_jby256_tail contains a further 53 bits precision. */
+
+  static const double atan_jby256_lead[  241] = {
+    6.24187886714935302734e-02,  /* 0x3faff55b00000000 */
+    6.63088560104370117188e-02,  /* 0x3fb0f99e00000000 */
+    7.01969265937805175781e-02,  /* 0x3fb1f86d00000000 */
+    7.40829110145568847656e-02,  /* 0x3fb2f71900000000 */
+    7.79666304588317871094e-02,  /* 0x3fb3f59f00000000 */
+    8.18479657173156738281e-02,  /* 0x3fb4f3fd00000000 */
+    8.57268571853637695312e-02,  /* 0x3fb5f23200000000 */
+    8.96031260490417480469e-02,  /* 0x3fb6f03b00000000 */
+    9.34767723083496093750e-02,  /* 0x3fb7ee1800000000 */
+    9.73475575447082519531e-02,  /* 0x3fb8ebc500000000 */
+    1.01215422153472900391e-01,  /* 0x3fb9e94100000000 */
+    1.05080246925354003906e-01,  /* 0x3fbae68a00000000 */
+    1.08941912651062011719e-01,  /* 0x3fbbe39e00000000 */
+    1.12800359725952148438e-01,  /* 0x3fbce07c00000000 */
+    1.16655409336090087891e-01,  /* 0x3fbddd2100000000 */
+    1.20507001876831054688e-01,  /* 0x3fbed98c00000000 */
+    1.24354958534240722656e-01,  /* 0x3fbfd5ba00000000 */
+    1.28199219703674316406e-01,  /* 0x3fc068d500000000 */
+    1.32039666175842285156e-01,  /* 0x3fc0e6ad00000000 */
+    1.35876297950744628906e-01,  /* 0x3fc1646500000000 */
+    1.39708757400512695312e-01,  /* 0x3fc1e1fa00000000 */
+    1.43537282943725585938e-01,  /* 0x3fc25f6e00000000 */
+    1.47361397743225097656e-01,  /* 0x3fc2dcbd00000000 */
+    1.51181221008300781250e-01,  /* 0x3fc359e800000000 */
+    1.54996633529663085938e-01,  /* 0x3fc3d6ee00000000 */
+    1.58807516098022460938e-01,  /* 0x3fc453ce00000000 */
+    1.62613749504089355469e-01,  /* 0x3fc4d08700000000 */
+    1.66415214538574218750e-01,  /* 0x3fc54d1800000000 */
+    1.70211911201477050781e-01,  /* 0x3fc5c98100000000 */
+    1.74003481864929199219e-01,  /* 0x3fc645bf00000000 */
+    1.77790164947509765625e-01,  /* 0x3fc6c1d400000000 */
+    1.81571602821350097656e-01,  /* 0x3fc73dbd00000000 */
+    1.85347914695739746094e-01,  /* 0x3fc7b97b00000000 */
+    1.89118742942810058594e-01,  /* 0x3fc8350b00000000 */
+    1.92884206771850585938e-01,  /* 0x3fc8b06e00000000 */
+    1.96644186973571777344e-01,  /* 0x3fc92ba300000000 */
+    2.00398445129394531250e-01,  /* 0x3fc9a6a800000000 */
+    2.04147100448608398438e-01,  /* 0x3fca217e00000000 */
+    2.07889914512634277344e-01,  /* 0x3fca9c2300000000 */
+    2.11626768112182617188e-01,  /* 0x3fcb169600000000 */
+    2.15357661247253417969e-01,  /* 0x3fcb90d700000000 */
+    2.19082474708557128906e-01,  /* 0x3fcc0ae500000000 */
+    2.22801089286804199219e-01,  /* 0x3fcc84bf00000000 */
+    2.26513504981994628906e-01,  /* 0x3fccfe6500000000 */
+    2.30219483375549316406e-01,  /* 0x3fcd77d500000000 */
+    2.33919143676757812500e-01,  /* 0x3fcdf11000000000 */
+    2.37612247467041015625e-01,  /* 0x3fce6a1400000000 */
+    2.41298794746398925781e-01,  /* 0x3fcee2e100000000 */
+    2.44978547096252441406e-01,  /* 0x3fcf5b7500000000 */
+    2.48651623725891113281e-01,  /* 0x3fcfd3d100000000 */
+    2.52317905426025390625e-01,  /* 0x3fd025fa00000000 */
+    2.55977153778076171875e-01,  /* 0x3fd061ee00000000 */
+    2.59629487991333007812e-01,  /* 0x3fd09dc500000000 */
+    2.63274669647216796875e-01,  /* 0x3fd0d97e00000000 */
+    2.66912937164306640625e-01,  /* 0x3fd1151a00000000 */
+    2.70543813705444335938e-01,  /* 0x3fd1509700000000 */
+    2.74167299270629882812e-01,  /* 0x3fd18bf500000000 */
+    2.77783632278442382812e-01,  /* 0x3fd1c73500000000 */
+    2.81392335891723632812e-01,  /* 0x3fd2025500000000 */
+    2.84993648529052734375e-01,  /* 0x3fd23d5600000000 */
+    2.88587331771850585938e-01,  /* 0x3fd2783700000000 */
+    2.92173147201538085938e-01,  /* 0x3fd2b2f700000000 */
+    2.95751571655273437500e-01,  /* 0x3fd2ed9800000000 */
+    2.99322128295898437500e-01,  /* 0x3fd3281800000000 */
+    3.02884817123413085938e-01,  /* 0x3fd3627700000000 */
+    3.06439399719238281250e-01,  /* 0x3fd39cb400000000 */
+    3.09986352920532226562e-01,  /* 0x3fd3d6d100000000 */
+    3.13524961471557617188e-01,  /* 0x3fd410cb00000000 */
+    3.17055702209472656250e-01,  /* 0x3fd44aa400000000 */
+    3.20578098297119140625e-01,  /* 0x3fd4845a00000000 */
+    3.24092388153076171875e-01,  /* 0x3fd4bdee00000000 */
+    3.27598333358764648438e-01,  /* 0x3fd4f75f00000000 */
+    3.31095933914184570312e-01,  /* 0x3fd530ad00000000 */
+    3.34585189819335937500e-01,  /* 0x3fd569d800000000 */
+    3.38066101074218750000e-01,  /* 0x3fd5a2e000000000 */
+    3.41538190841674804688e-01,  /* 0x3fd5dbc300000000 */
+    3.45002174377441406250e-01,  /* 0x3fd6148400000000 */
+    3.48457098007202148438e-01,  /* 0x3fd64d1f00000000 */
+    3.51903676986694335938e-01,  /* 0x3fd6859700000000 */
+    3.55341434478759765625e-01,  /* 0x3fd6bdea00000000 */
+    3.58770608901977539062e-01,  /* 0x3fd6f61900000000 */
+    3.62190723419189453125e-01,  /* 0x3fd72e2200000000 */
+    3.65602254867553710938e-01,  /* 0x3fd7660700000000 */
+    3.69004726409912109375e-01,  /* 0x3fd79dc600000000 */
+    3.72398376464843750000e-01,  /* 0x3fd7d56000000000 */
+    3.75782966613769531250e-01,  /* 0x3fd80cd400000000 */
+    3.79158496856689453125e-01,  /* 0x3fd8442200000000 */
+    3.82525205612182617188e-01,  /* 0x3fd87b4b00000000 */
+    3.85882616043090820312e-01,  /* 0x3fd8b24d00000000 */
+    3.89230966567993164062e-01,  /* 0x3fd8e92900000000 */
+    3.92570018768310546875e-01,  /* 0x3fd91fde00000000 */
+    3.95900011062622070312e-01,  /* 0x3fd9566d00000000 */
+    3.99220705032348632812e-01,  /* 0x3fd98cd500000000 */
+    4.02532100677490234375e-01,  /* 0x3fd9c31600000000 */
+    4.05834197998046875000e-01,  /* 0x3fd9f93000000000 */
+    4.09126996994018554688e-01,  /* 0x3fda2f2300000000 */
+    4.12410259246826171875e-01,  /* 0x3fda64ee00000000 */
+    4.15684223175048828125e-01,  /* 0x3fda9a9200000000 */
+    4.18948888778686523438e-01,  /* 0x3fdad00f00000000 */
+    4.22204017639160156250e-01,  /* 0x3fdb056400000000 */
+    4.25449609756469726562e-01,  /* 0x3fdb3a9100000000 */
+    4.28685665130615234375e-01,  /* 0x3fdb6f9600000000 */
+    4.31912183761596679688e-01,  /* 0x3fdba47300000000 */
+    4.35129165649414062500e-01,  /* 0x3fdbd92800000000 */
+    4.38336372375488281250e-01,  /* 0x3fdc0db400000000 */
+    4.41534280776977539062e-01,  /* 0x3fdc421900000000 */
+    4.44722414016723632812e-01,  /* 0x3fdc765500000000 */
+    4.47900772094726562500e-01,  /* 0x3fdcaa6800000000 */
+    4.51069593429565429688e-01,  /* 0x3fdcde5300000000 */
+    4.54228639602661132812e-01,  /* 0x3fdd121500000000 */
+    4.57377910614013671875e-01,  /* 0x3fdd45ae00000000 */
+    4.60517644882202148438e-01,  /* 0x3fdd791f00000000 */
+    4.63647603988647460938e-01,  /* 0x3fddac6700000000 */
+    4.66767549514770507812e-01,  /* 0x3fdddf8500000000 */
+    4.69877958297729492188e-01,  /* 0x3fde127b00000000 */
+    4.72978591918945312500e-01,  /* 0x3fde454800000000 */
+    4.76069211959838867188e-01,  /* 0x3fde77eb00000000 */
+    4.79150056838989257812e-01,  /* 0x3fdeaa6500000000 */
+    4.82221126556396484375e-01,  /* 0x3fdedcb600000000 */
+    4.85282421112060546875e-01,  /* 0x3fdf0ede00000000 */
+    4.88333940505981445312e-01,  /* 0x3fdf40dd00000000 */
+    4.91375446319580078125e-01,  /* 0x3fdf72b200000000 */
+    4.94406938552856445312e-01,  /* 0x3fdfa45d00000000 */
+    4.97428894042968750000e-01,  /* 0x3fdfd5e000000000 */
+    5.00440597534179687500e-01,  /* 0x3fe0039c00000000 */
+    5.03442764282226562500e-01,  /* 0x3fe01c3400000000 */
+    5.06434917449951171875e-01,  /* 0x3fe034b700000000 */
+    5.09417057037353515625e-01,  /* 0x3fe04d2500000000 */
+    5.12389183044433593750e-01,  /* 0x3fe0657e00000000 */
+    5.15351772308349609375e-01,  /* 0x3fe07dc300000000 */
+    5.18304347991943359375e-01,  /* 0x3fe095f300000000 */
+    5.21246910095214843750e-01,  /* 0x3fe0ae0e00000000 */
+    5.24179458618164062500e-01,  /* 0x3fe0c61400000000 */
+    5.27101993560791015625e-01,  /* 0x3fe0de0500000000 */
+    5.30014991760253906250e-01,  /* 0x3fe0f5e200000000 */
+    5.32917976379394531250e-01,  /* 0x3fe10daa00000000 */
+    5.35810947418212890625e-01,  /* 0x3fe1255d00000000 */
+    5.38693904876708984375e-01,  /* 0x3fe13cfb00000000 */
+    5.41567325592041015625e-01,  /* 0x3fe1548500000000 */
+    5.44430732727050781250e-01,  /* 0x3fe16bfa00000000 */
+    5.47284126281738281250e-01,  /* 0x3fe1835a00000000 */
+    5.50127506256103515625e-01,  /* 0x3fe19aa500000000 */
+    5.52961349487304687500e-01,  /* 0x3fe1b1dc00000000 */
+    5.55785179138183593750e-01,  /* 0x3fe1c8fe00000000 */
+    5.58598995208740234375e-01,  /* 0x3fe1e00b00000000 */
+    5.61403274536132812500e-01,  /* 0x3fe1f70400000000 */
+    5.64197540283203125000e-01,  /* 0x3fe20de800000000 */
+    5.66981792449951171875e-01,  /* 0x3fe224b700000000 */
+    5.69756031036376953125e-01,  /* 0x3fe23b7100000000 */
+    5.72520732879638671875e-01,  /* 0x3fe2521700000000 */
+    5.75275897979736328125e-01,  /* 0x3fe268a900000000 */
+    5.78021049499511718750e-01,  /* 0x3fe27f2600000000 */
+    5.80756187438964843750e-01,  /* 0x3fe2958e00000000 */
+    5.83481788635253906250e-01,  /* 0x3fe2abe200000000 */
+    5.86197376251220703125e-01,  /* 0x3fe2c22100000000 */
+    5.88903427124023437500e-01,  /* 0x3fe2d84c00000000 */
+    5.91599464416503906250e-01,  /* 0x3fe2ee6200000000 */
+    5.94285964965820312500e-01,  /* 0x3fe3046400000000 */
+    5.96962928771972656250e-01,  /* 0x3fe31a5200000000 */
+    5.99629878997802734375e-01,  /* 0x3fe3302b00000000 */
+    6.02287292480468750000e-01,  /* 0x3fe345f000000000 */
+    6.04934692382812500000e-01,  /* 0x3fe35ba000000000 */
+    6.07573032379150390625e-01,  /* 0x3fe3713d00000000 */
+    6.10201358795166015625e-01,  /* 0x3fe386c500000000 */
+    6.12820148468017578125e-01,  /* 0x3fe39c3900000000 */
+    6.15428924560546875000e-01,  /* 0x3fe3b19800000000 */
+    6.18028640747070312500e-01,  /* 0x3fe3c6e400000000 */
+    6.20618820190429687500e-01,  /* 0x3fe3dc1c00000000 */
+    6.23198986053466796875e-01,  /* 0x3fe3f13f00000000 */
+    6.25770092010498046875e-01,  /* 0x3fe4064f00000000 */
+    6.28331184387207031250e-01,  /* 0x3fe41b4a00000000 */
+    6.30883216857910156250e-01,  /* 0x3fe4303200000000 */
+    6.33425712585449218750e-01,  /* 0x3fe4450600000000 */
+    6.35958671569824218750e-01,  /* 0x3fe459c600000000 */
+    6.38482093811035156250e-01,  /* 0x3fe46e7200000000 */
+    6.40995979309082031250e-01,  /* 0x3fe4830a00000000 */
+    6.43500804901123046875e-01,  /* 0x3fe4978f00000000 */
+    6.45996093750000000000e-01,  /* 0x3fe4ac0000000000 */
+    6.48482322692871093750e-01,  /* 0x3fe4c05e00000000 */
+    6.50959014892578125000e-01,  /* 0x3fe4d4a800000000 */
+    6.53426170349121093750e-01,  /* 0x3fe4e8de00000000 */
+    6.55884265899658203125e-01,  /* 0x3fe4fd0100000000 */
+    6.58332824707031250000e-01,  /* 0x3fe5111000000000 */
+    6.60772323608398437500e-01,  /* 0x3fe5250c00000000 */
+    6.63202762603759765625e-01,  /* 0x3fe538f500000000 */
+    6.65623664855957031250e-01,  /* 0x3fe54cca00000000 */
+    6.68035984039306640625e-01,  /* 0x3fe5608d00000000 */
+    6.70438766479492187500e-01,  /* 0x3fe5743c00000000 */
+    6.72832489013671875000e-01,  /* 0x3fe587d800000000 */
+    6.75216674804687500000e-01,  /* 0x3fe59b6000000000 */
+    6.77592277526855468750e-01,  /* 0x3fe5aed600000000 */
+    6.79958820343017578125e-01,  /* 0x3fe5c23900000000 */
+    6.82316303253173828125e-01,  /* 0x3fe5d58900000000 */
+    6.84664726257324218750e-01,  /* 0x3fe5e8c600000000 */
+    6.87004089355468750000e-01,  /* 0x3fe5fbf000000000 */
+    6.89334869384765625000e-01,  /* 0x3fe60f0800000000 */
+    6.91656589508056640625e-01,  /* 0x3fe6220d00000000 */
+    6.93969249725341796875e-01,  /* 0x3fe634ff00000000 */
+    6.96272850036621093750e-01,  /* 0x3fe647de00000000 */
+    6.98567867279052734375e-01,  /* 0x3fe65aab00000000 */
+    7.00854301452636718750e-01,  /* 0x3fe66d6600000000 */
+    7.03131675720214843750e-01,  /* 0x3fe6800e00000000 */
+    7.05400466918945312500e-01,  /* 0x3fe692a400000000 */
+    7.07660198211669921875e-01,  /* 0x3fe6a52700000000 */
+    7.09911346435546875000e-01,  /* 0x3fe6b79800000000 */
+    7.12153911590576171875e-01,  /* 0x3fe6c9f700000000 */
+    7.14387893676757812500e-01,  /* 0x3fe6dc4400000000 */
+    7.16613292694091796875e-01,  /* 0x3fe6ee7f00000000 */
+    7.18829631805419921875e-01,  /* 0x3fe700a700000000 */
+    7.21037864685058593750e-01,  /* 0x3fe712be00000000 */
+    7.23237514495849609375e-01,  /* 0x3fe724c300000000 */
+    7.25428581237792968750e-01,  /* 0x3fe736b600000000 */
+    7.27611064910888671875e-01,  /* 0x3fe7489700000000 */
+    7.29785442352294921875e-01,  /* 0x3fe75a6700000000 */
+    7.31950759887695312500e-01,  /* 0x3fe76c2400000000 */
+    7.34108448028564453125e-01,  /* 0x3fe77dd100000000 */
+    7.36257076263427734375e-01,  /* 0x3fe78f6b00000000 */
+    7.38397598266601562500e-01,  /* 0x3fe7a0f400000000 */
+    7.40530014038085937500e-01,  /* 0x3fe7b26c00000000 */
+    7.42654323577880859375e-01,  /* 0x3fe7c3d300000000 */
+    7.44770050048828125000e-01,  /* 0x3fe7d52800000000 */
+    7.46877670288085937500e-01,  /* 0x3fe7e66c00000000 */
+    7.48976707458496093750e-01,  /* 0x3fe7f79e00000000 */
+    7.51068115234375000000e-01,  /* 0x3fe808c000000000 */
+    7.53150939941406250000e-01,  /* 0x3fe819d000000000 */
+    7.55226135253906250000e-01,  /* 0x3fe82ad000000000 */
+    7.57292747497558593750e-01,  /* 0x3fe83bbe00000000 */
+    7.59351730346679687500e-01,  /* 0x3fe84c9c00000000 */
+    7.61402606964111328125e-01,  /* 0x3fe85d6900000000 */
+    7.63445377349853515625e-01,  /* 0x3fe86e2500000000 */
+    7.65480041503906250000e-01,  /* 0x3fe87ed000000000 */
+    7.67507076263427734375e-01,  /* 0x3fe88f6b00000000 */
+    7.69526004791259765625e-01,  /* 0x3fe89ff500000000 */
+    7.71537303924560546875e-01,  /* 0x3fe8b06f00000000 */
+    7.73540973663330078125e-01,  /* 0x3fe8c0d900000000 */
+    7.75536537170410156250e-01,  /* 0x3fe8d13200000000 */
+    7.77523994445800781250e-01,  /* 0x3fe8e17a00000000 */
+    7.79504299163818359375e-01,  /* 0x3fe8f1b300000000 */
+    7.81476497650146484375e-01,  /* 0x3fe901db00000000 */
+    7.83441066741943359375e-01,  /* 0x3fe911f300000000 */
+    7.85398006439208984375e-01}; /* 0x3fe921fb00000000 */
+
+  static const double atan_jby256_tail[  241] = {
+    2.13244638182005395671e-08,  /* 0x3e56e59fbd38db2c */
+    3.89093864761712760656e-08,  /* 0x3e64e3aa54dedf96 */
+    4.44780900009437454576e-08,  /* 0x3e67e105ab1bda88 */
+    1.15344768460112754160e-08,  /* 0x3e48c5254d013fd0 */
+    3.37271051945395312705e-09,  /* 0x3e2cf8ab3ad62670 */
+    2.40857608736109859459e-08,  /* 0x3e59dca4bec80468 */
+    1.85853810450623807768e-08,  /* 0x3e53f4b5ec98a8da */
+    5.14358299969225078306e-08,  /* 0x3e6b9d49619d81fe */
+    8.85023985412952486748e-09,  /* 0x3e43017887460934 */
+    1.59425154214358432060e-08,  /* 0x3e511e3eca0b9944 */
+    1.95139937737755753164e-08,  /* 0x3e54f3f73c5a332e */
+    2.64909755273544319715e-08,  /* 0x3e5c71c8ae0e00a6 */
+    4.43388037881231070144e-08,  /* 0x3e67cde0f86fbdc7 */
+    2.14757072421821274557e-08,  /* 0x3e570f328c889c72 */
+    2.61049792670754218852e-08,  /* 0x3e5c07ae9b994efe */
+    7.81439350674466302231e-09,  /* 0x3e40c8021d7b1698 */
+    3.60125207123751024094e-08,  /* 0x3e635585edb8cb22 */
+    6.15276238179343767917e-08,  /* 0x3e70842567b30e96 */
+    9.54387964641184285058e-08,  /* 0x3e799e811031472e */
+    3.02789566851502754129e-08,  /* 0x3e6041821416bcee */
+    1.16888650949870856331e-07,  /* 0x3e7f6086e4dc96f4 */
+    1.07580956468653338863e-08,  /* 0x3e471a535c5f1b58 */
+    8.33454265379535427653e-08,  /* 0x3e765f743fe63ca1 */
+    1.10790279272629526068e-07,  /* 0x3e7dbd733472d014 */
+    1.08394277896366207424e-07,  /* 0x3e7d18cc4d8b0d1d */
+    9.22176086126841098800e-08,  /* 0x3e78c12553c8fb29 */
+    7.90938592199048786990e-08,  /* 0x3e753b49e2e8f991 */
+    8.66445407164293125637e-08,  /* 0x3e77422ae148c141 */
+    1.40839973537092438671e-08,  /* 0x3e4e3ec269df56a8 */
+    1.19070438507307600689e-07,  /* 0x3e7ff6754e7e0ac9 */
+    6.40451663051716197071e-08,  /* 0x3e7131267b1b5aad */
+    1.08338682076343674522e-07,  /* 0x3e7d14fa403a94bc */
+    3.52999550187922736222e-08,  /* 0x3e62f396c089a3d8 */
+    1.05983273930043077202e-07,  /* 0x3e7c731d78fa95bb */
+    1.05486124078259553339e-07,  /* 0x3e7c50f385177399 */
+    5.82167732281776477773e-08,  /* 0x3e6f41409c6f2c20 */
+    1.08696483983403942633e-07,  /* 0x3e7d2d90c4c39ec0 */
+    4.47335086122377542835e-08,  /* 0x3e680420696f2106 */
+    1.26896287162615723528e-08,  /* 0x3e4b40327943a2e8 */
+    4.06534471589151404531e-08,  /* 0x3e65d35e02f3d2a2 */
+    3.84504846300557026690e-08,  /* 0x3e64a498288117b0 */
+    3.60715006404807269080e-08,  /* 0x3e635da119afb324 */
+    6.44725903165522722801e-08,  /* 0x3e714e85cdb9a908 */
+    3.63749249976409461305e-08,  /* 0x3e638754e5547b9a */
+    1.03901294413833913794e-07,  /* 0x3e7be40ae6ce3246 */
+    6.25379756302167880580e-08,  /* 0x3e70c993b3bea7e7 */
+    6.63984302368488828029e-08,  /* 0x3e71d2dd89ac3359 */
+    3.21844598971548278059e-08,  /* 0x3e61476603332c46 */
+    1.16030611712765830905e-07,  /* 0x3e7f25901bac55b7 */
+    1.17464622142347730134e-07,  /* 0x3e7f881b7c826e28 */
+    7.54604017965808996596e-08,  /* 0x3e7441996d698d20 */
+    1.49234929356206556899e-07,  /* 0x3e8407ac521ea089 */
+    1.41416924523217430259e-07,  /* 0x3e82fb0c6c4b1723 */
+    2.13308065617483489011e-07,  /* 0x3e8ca135966a3e18 */
+    5.04230937933302320146e-08,  /* 0x3e6b1218e4d646e4 */
+    5.45874922281655519035e-08,  /* 0x3e6d4e72a350d288 */
+    1.51849028914786868886e-07,  /* 0x3e84617e2f04c329 */
+    3.09004308703769273010e-08,  /* 0x3e6096ec41e82650 */
+    9.67574548184738317664e-08,  /* 0x3e79f91f25773e6e */
+    4.02508285529322212824e-08,  /* 0x3e659c0820f1d674 */
+    3.01222268096861091157e-08,  /* 0x3e602bf7a2df1064 */
+    2.36189860670079288680e-07,  /* 0x3e8fb36bfc40508f */
+    1.14095158111080887695e-07,  /* 0x3e7ea08f3f8dc892 */
+    7.42349089746573467487e-08,  /* 0x3e73ed6254656a0e */
+    5.12515583196230380184e-08,  /* 0x3e6b83f5e5e69c58 */
+    2.19290391828763918102e-07,  /* 0x3e8d6ec2af768592 */
+    3.83263512187553886471e-08,  /* 0x3e6493889a226f94 */
+    1.61513486284090523855e-07,  /* 0x3e85ad8fa65279ba */
+    5.09996743535589922261e-08,  /* 0x3e6b615784d45434 */
+    1.23694037861246766534e-07,  /* 0x3e809a184368f145 */
+    8.23367955351123783984e-08,  /* 0x3e761a2439b0d91c */
+    1.07591766213053694014e-07,  /* 0x3e7ce1a65e39a978 */
+    1.42789947524631815640e-07,  /* 0x3e832a39a93b6a66 */
+    1.32347123024711878538e-07,  /* 0x3e81c3699af804e7 */
+    2.17626067316598149229e-08,  /* 0x3e575e0f4e44ede8 */
+    2.34454866923044288656e-07,  /* 0x3e8f77ced1a7a83b */
+    2.82966370261766916053e-09,  /* 0x3e284e7f0cb1b500 */
+    2.29300919890907632975e-07,  /* 0x3e8ec6b838b02dfe */
+    1.48428270450261284915e-07,  /* 0x3e83ebf4dfbeda87 */
+    1.87937408574313982512e-07,  /* 0x3e89397aed9cb475 */
+    6.13685946813334055347e-08,  /* 0x3e707937bc239c54 */
+    1.98585022733583817493e-07,  /* 0x3e8aa754553131b6 */
+    7.68394131623752961662e-08,  /* 0x3e74a05d407c45dc */
+    1.28119052312436745644e-07,  /* 0x3e8132231a206dd0 */
+    7.02119104719236502733e-08,  /* 0x3e72d8ecfdd69c88 */
+    9.87954793820636301943e-08,  /* 0x3e7a852c74218606 */
+    1.72176752381034986217e-07,  /* 0x3e871bf2baeebb50 */
+    1.12877225146169704119e-08,  /* 0x3e483d7db7491820 */
+    5.33549829555851737993e-08,  /* 0x3e6ca50d92b6da14 */
+    2.13833275710816521345e-08,  /* 0x3e56f5cde8530298 */
+    1.16243518048290556393e-07,  /* 0x3e7f343198910740 */
+    6.29926408369055877943e-08,  /* 0x3e70e8d241ccd80a */
+    6.45429039328021963791e-08,  /* 0x3e71535ac619e6c8 */
+    8.64001922814281933403e-08,  /* 0x3e77316041c36cd2 */
+    9.50767572202325800240e-08,  /* 0x3e7985a000637d8e */
+    5.80851497508121135975e-08,  /* 0x3e6f2f29858c0a68 */
+    1.82350561135024766232e-07,  /* 0x3e8879847f96d909 */
+    1.98948680587390608655e-07,  /* 0x3e8ab3d319e12e42 */
+    7.83548663450197659846e-08,  /* 0x3e75088162dfc4c2 */
+    3.04374234486798594427e-08,  /* 0x3e605749a1cd9d8c */
+    2.76135725629797411787e-08,  /* 0x3e5da65c6c6b8618 */
+    4.32610105454203065470e-08,  /* 0x3e6739bf7df1ad64 */
+    5.17107515324127256994e-08,  /* 0x3e6bc31252aa3340 */
+    2.82398327875841444660e-08,  /* 0x3e5e528191ad3aa8 */
+    1.87482469524195595399e-07,  /* 0x3e8929d93df19f18 */
+    2.97481891662714096139e-08,  /* 0x3e5ff11eb693a080 */
+    9.94421570843584316402e-09,  /* 0x3e455ae3f145a3a0 */
+    1.07056210730391848428e-07,  /* 0x3e7cbcd8c6c0ca82 */
+    6.25589580466881163081e-08,  /* 0x3e70cb04d425d304 */
+    9.56641013869464593803e-08,  /* 0x3e79adfcab5be678 */
+    1.88056307148355440276e-07,  /* 0x3e893d90c5662508 */
+    8.38850689379557880950e-08,  /* 0x3e768489bd35ff40 */
+    5.01215865527674122924e-09,  /* 0x3e3586ed3da2b7e0 */
+    1.74166095998522089762e-07,  /* 0x3e87604d2e850eee */
+    9.96779574395363585849e-08,  /* 0x3e7ac1d12bfb53d8 */
+    5.98432026368321460686e-09,  /* 0x3e39b3d468274740 */
+    1.18362922366887577169e-07,  /* 0x3e7fc5d68d10e53c */
+    1.86086833284154215946e-07,  /* 0x3e88f9e51884becb */
+    1.97671457251348941011e-07,  /* 0x3e8a87f0869c06d1 */
+    1.42447160717199237159e-07,  /* 0x3e831e7279f685fa */
+    1.05504240785546574184e-08,  /* 0x3e46a8282f9719b0 */
+    3.13335218371639189324e-08,  /* 0x3e60d2724a8a44e0 */
+    1.96518418901914535399e-07,  /* 0x3e8a60524b11ad4e */
+    2.17692035039173536059e-08,  /* 0x3e575fdf832750f0 */
+    2.15613114426529981675e-07,  /* 0x3e8cf06902e4cd36 */
+    5.68271098300441214948e-08,  /* 0x3e6e82422d4f6d10 */
+    1.70331455823369124256e-08,  /* 0x3e524a091063e6c0 */
+    9.17590028095709583247e-08,  /* 0x3e78a1a172dc6f38 */
+    2.77266304112916566247e-07,  /* 0x3e929b6619f8a92d */
+    9.37041937614656939690e-08,  /* 0x3e79274d9c1b70c8 */
+    1.56116346368316796511e-08,  /* 0x3e50c34b1fbb7930 */
+    4.13967433808382727413e-08,  /* 0x3e6639866c20eb50 */
+    1.70164749185821616276e-07,  /* 0x3e86d6d0f6832e9e */
+    4.01708788545600086008e-07,  /* 0x3e9af54def99f25e */
+    2.59663539226050551563e-07,  /* 0x3e916cfc52a00262 */
+    2.22007487655027469542e-07,  /* 0x3e8dcc1e83569c32 */
+    2.90542250809644081369e-07,  /* 0x3e937f7a551ed425 */
+    4.67720537666628903341e-07,  /* 0x3e9f6360adc98887 */
+    2.79799803956772554802e-07,  /* 0x3e92c6ec8d35a2c1 */
+    2.07344552327432547723e-07,  /* 0x3e8bd44df84cb036 */
+    2.54705698692735196368e-07,  /* 0x3e9117cf826e310e */
+    4.26848589539548450728e-07,  /* 0x3e9ca533f332cfc9 */
+    2.52506723633552216197e-07,  /* 0x3e90f208509dbc2e */
+    2.14684129933849704964e-07,  /* 0x3e8cd07d93c945de */
+    3.20134822201596505431e-07,  /* 0x3e957bdfd67e6d72 */
+    9.93537565749855712134e-08,  /* 0x3e7aab89c516c658 */
+    3.70792944827917252327e-08,  /* 0x3e63e823b1a1b8a0 */
+    1.41772749369083698972e-07,  /* 0x3e8307464a9d6d3c */
+    4.22446601490198804306e-07,  /* 0x3e9c5993cd438843 */
+    4.11818433724801511540e-07,  /* 0x3e9ba2fca02ab554 */
+    1.19976381502605310519e-07,  /* 0x3e801a5b6983a268 */
+    3.43703078571520905265e-08,  /* 0x3e6273d1b350efc8 */
+    1.66128705555453270379e-07,  /* 0x3e864c238c37b0c6 */
+    5.00499610023283006540e-08,  /* 0x3e6aded07370a300 */
+    1.75105139941208062123e-07,  /* 0x3e878091197eb47e */
+    7.70807146729030327334e-08,  /* 0x3e74b0f245e0dabc */
+    2.45918607526895836121e-07,  /* 0x3e9080d9794e2eaf */
+    2.18359020958626199345e-07,  /* 0x3e8d4ec242b60c76 */
+    8.44342887976445333569e-09,  /* 0x3e4221d2f940caa0 */
+    1.07506148687888629299e-07,  /* 0x3e7cdbc42b2bba5c */
+    5.36544954316820904572e-08,  /* 0x3e6cce37bb440840 */
+    3.39109101518396596341e-07,  /* 0x3e96c1d999cf1dd0 */
+    2.60098720293920613340e-08,  /* 0x3e5bed8a07eb0870 */
+    8.42678991664621455827e-08,  /* 0x3e769ed88f490e3c */
+    5.36972237470183633197e-08,  /* 0x3e6cd41719b73ef0 */
+    4.28192558171921681288e-07,  /* 0x3e9cbc4ac95b41b7 */
+    2.71535491483955143294e-07,  /* 0x3e9238f1b890f5d7 */
+    7.84094998145075780203e-08,  /* 0x3e750c4282259cc4 */
+    3.43880599134117431863e-07,  /* 0x3e9713d2de87b3e2 */
+    1.32878065060366481043e-07,  /* 0x3e81d5a7d2255276 */
+    4.18046802627967629428e-07,  /* 0x3e9c0dfd48227ac1 */
+    2.65042411765766019424e-07,  /* 0x3e91c964dab76753 */
+    1.70383695347518643694e-07,  /* 0x3e86de56d5704496 */
+    1.54096497259613515678e-07,  /* 0x3e84aeb71fd19968 */
+    2.36543402412459813461e-07,  /* 0x3e8fbf91c57b1918 */
+    4.38416350106876736790e-07,  /* 0x3e9d6bef7fbe5d9a */
+    3.03892161339927775731e-07,  /* 0x3e9464d3dc249066 */
+    3.31136771605664899240e-07,  /* 0x3e9638e2ec4d9073 */
+    6.49494294526590682218e-08,  /* 0x3e716f4a7247ea7c */
+    4.10423429887181345747e-09,  /* 0x3e31a0a740f1d440 */
+    1.70831640869113847224e-07,  /* 0x3e86edbb0114a33c */
+    1.10811512657909180966e-07,  /* 0x3e7dbee8bf1d513c */
+    3.23677724749783611964e-07,  /* 0x3e95b8bdb0248f73 */
+    3.55662734259192678528e-07,  /* 0x3e97de3d3f5eac64 */
+    2.30102333489738219140e-07,  /* 0x3e8ee24187ae448a */
+    4.47429004000738629714e-07,  /* 0x3e9e06c591ec5192 */
+    7.78167135617329598659e-08,  /* 0x3e74e3861a332738 */
+    9.90345291908535415737e-08,  /* 0x3e7a9599dcc2bfe4 */
+    5.85800913143113728314e-08,  /* 0x3e6f732fbad43468 */
+    4.57859062410871843857e-07,  /* 0x3e9eb9f573b727d9 */
+    3.67993069723390929794e-07,  /* 0x3e98b212a2eb9897 */
+    2.90836464322977276043e-07,  /* 0x3e9384884c167215 */
+    2.51621574250131388318e-07,  /* 0x3e90e2d363020051 */
+    2.75789824740652815545e-07,  /* 0x3e92820879fbd022 */
+    3.88985776250314403593e-07,  /* 0x3e9a1ab9893e4b30 */
+    1.40214080183768019611e-07,  /* 0x3e82d1b817a24478 */
+    3.23451432223550478373e-08,  /* 0x3e615d7b8ded4878 */
+    9.15979180730608444470e-08,  /* 0x3e78968f9db3a5e4 */
+    3.44371402498640470421e-07,  /* 0x3e971c4171fe135f */
+    3.40401897215059498077e-07,  /* 0x3e96d80f605d0d8c */
+    1.06431813453707950243e-07,  /* 0x3e7c91f043691590 */
+    1.46204238932338846248e-07,  /* 0x3e839f8a15fce2b2 */
+    9.94610376972039046878e-09,  /* 0x3e455beda9d94b80 */
+    2.01711528092681771039e-07,  /* 0x3e8b12c15d60949a */
+    2.72027977986191568296e-07,  /* 0x3e924167b312bfe3 */
+    2.48402602511693757964e-07,  /* 0x3e90ab8633070277 */
+    1.58480011219249621715e-07,  /* 0x3e854554ebbc80ee */
+    3.00372828113368713281e-08,  /* 0x3e60204aef5a4bb8 */
+    3.67816204583541976394e-07,  /* 0x3e98af08c679cf2c */
+    2.46169793032343824291e-07,  /* 0x3e90852a330ae6c8 */
+    1.70080468270204253247e-07,  /* 0x3e86d3eb9ec32916 */
+    1.67806717763872914315e-07,  /* 0x3e8685cb7fcbbafe */
+    2.67715622006907942620e-07,  /* 0x3e91f751c1e0bd95 */
+    2.14411342550299170574e-08,  /* 0x3e5705b1b0f72560 */
+    4.11228221283669073277e-07,  /* 0x3e9b98d8d808ca92 */
+    3.52311752396749662260e-08,  /* 0x3e62ea22c75cc980 */
+    3.52718000397367821054e-07,  /* 0x3e97aba62bca0350 */
+    4.38857387992911129814e-07,  /* 0x3e9d73833442278c */
+    3.22574606753482540743e-07,  /* 0x3e95a5ca1fb18bf9 */
+    3.28730371182804296828e-08,  /* 0x3e61a6092b6ecf28 */
+    7.56672470607639279700e-08,  /* 0x3e744fd049aac104 */
+    3.26750155316369681821e-09,  /* 0x3e2c114fd8df5180 */
+    3.21724445362095284743e-07,  /* 0x3e95972f130feae5 */
+    1.06639427371776571151e-07,  /* 0x3e7ca034a55fe198 */
+    3.41020788139524715063e-07,  /* 0x3e96e2b149990227 */
+    1.00582838631232552824e-07,  /* 0x3e7b00000294592c */
+    3.68439433859276640065e-07,  /* 0x3e98b9bdc442620e */
+    2.20403078342388012027e-07,  /* 0x3e8d94fdfabf3e4e */
+    1.62841467098298142534e-07,  /* 0x3e85db30b145ad9a */
+    2.25325348296680733838e-07,  /* 0x3e8e3e1eb95022b0 */
+    4.37462238226421614339e-07,  /* 0x3e9d5b8b45442bd6 */
+    3.52055880555040706500e-07,  /* 0x3e97a046231ecd2e */
+    4.75614398494781776825e-07,  /* 0x3e9feafe3ef55232 */
+    3.60998399033215317516e-07,  /* 0x3e9839e7bfd78267 */
+    3.79292434611513945954e-08,  /* 0x3e645cf49d6fa900 */
+    1.29859015528549300061e-08,  /* 0x3e4be3132b27f380 */
+    3.15927546985474913188e-07,  /* 0x3e9533980bb84f9f */
+    2.28533679887379668031e-08,  /* 0x3e5889e2ce3ba390 */
+    1.17222541823553133877e-07,  /* 0x3e7f7778c3ad0cc8 */
+    1.51991208405464415857e-07,  /* 0x3e846660cec4eba2 */
+    1.56958239325240655564e-07}; /* 0x3e85110b4611a626 */
+
+  /* Some constants and split constants. */
+
+  static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
+             piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
+             piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
+       three_piby4 = 2.3561944901923449e+00, /* 0x4002d97c7f3321d2 */
+           pi_head = 3.1415926218032836e+00, /* 0x400921fb50000000 */
+           pi_tail = 3.1786509547056392e-08, /* 0x3e6110b4611a6263 */
+        piby2_head = 1.5707963267948965e+00, /* 0x3ff921fb54442d18 */
+        piby2_tail = 6.1232339957367660e-17; /* 0x3c91a62633145c07 */
+
+  double u, v, vbyu, q1, q2, s, u1, vu1, u2, vu2, uu, c, r;
+  unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
+  int m, xexp, yexp, diffexp;
+
+  /* Find properties of arguments x and y. */
+
+  unsigned long long ux, ui, aux, xneg, uy, auy, yneg;
+
+  GET_BITS_DP64(x, ux);
+  GET_BITS_DP64(y, uy);
+  aux = ux & ~SIGNBIT_DP64;
+  auy = uy & ~SIGNBIT_DP64;
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  xneg = ux & SIGNBIT_DP64;
+  yneg = uy & SIGNBIT_DP64;
+  xzero = (aux == 0);
+  yzero = (auy == 0);
+  xnan = (aux > PINFBITPATT_DP64);
+  ynan = (auy > PINFBITPATT_DP64);
+  xinf = (aux == PINFBITPATT_DP64);
+  yinf = (auy == PINFBITPATT_DP64);
+
+  diffexp = yexp - xexp;
+
+  /* Special cases */
+
+  if (xnan)
+#ifdef WINDOWS
+    return handle_error("atan2", ux|0x0008000000000000, _DOMAIN, 0,
+                        EDOM, x, y);
+#else
+    return x + x; /* Raise invalid if it's a signalling NaN */
+#endif
+  else if (ynan)
+#ifdef WINDOWS
+    return handle_error("atan2", uy|0x0008000000000000, _DOMAIN, 0,
+                        EDOM, x, y);
+#else
+    return y + y; /* Raise invalid if it's a signalling NaN */
+#endif
+  else if (yzero)
+    { /* Zero y gives +-0 for positive x
+         and +-pi for negative x */
+#ifndef WINDOWS
+      if ((_LIB_VERSION == _SVID_) && xzero)
+        /* Sigh - _SVID_ defines atan2(0,0) as a domain error */
+        return retval_errno_edom(x, y);
+      else
+#endif
+      if (xneg)
+	{
+	  if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
+          else return val_with_flags(pi,AMD_F_INEXACT);
+	}
+      else return y;
+    }
+  else if (xzero)
+    { /* Zero x gives +- pi/2
+         depending on sign of y */
+      if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
+      else val_with_flags(piby2,AMD_F_INEXACT);
+    }
+
+  /* Scale up both x and y if they are both below 1/4.
+     This avoids any possible later denormalised arithmetic. */
+
+  if ((xexp < 1021 && yexp < 1021))
+    {
+      scaleUpDouble1024(ux, &ux);
+      scaleUpDouble1024(uy, &uy);
+      PUT_BITS_DP64(ux, x);
+      PUT_BITS_DP64(uy, y);
+      xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+      yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+      diffexp = yexp - xexp;
+    }
+
+  if (diffexp > 56)
+    { /* abs(y)/abs(x) > 2^56 => arctan(x/y)
+         is insignificant compared to piby2 */
+      if (yneg) return val_with_flags(-piby2,AMD_F_INEXACT);
+      else return val_with_flags(piby2,AMD_F_INEXACT);
+    }
+  else if (diffexp < -28 && (!xneg))
+    { /* x positive and dominant over y by a factor of 2^28.
+         In this case atan(y/x) is y/x to machine accuracy. */
+
+      if (diffexp < -1074) /* Result underflows */
+        {
+          if (yneg)
+            return val_with_flags(-0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
+          else
+            return val_with_flags(0.0,AMD_F_INEXACT | AMD_F_UNDERFLOW);
+        }
+      else
+        {
+          if (diffexp < -1022)
+            {
+              /* Result will likely be denormalized */
+              y = scaleDouble_1(y, 100);
+              y /= x;
+              /* Now y is 2^100 times the true result. Scale it back down. */
+              GET_BITS_DP64(y, uy);
+	      scaleDownDouble(uy, 100, &uy);
+              PUT_BITS_DP64(uy, y);
+	      if ((uy & EXPBITS_DP64) == 0)
+		return val_with_flags(y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+	      else
+		return y;
+             }
+          else
+            return y / x;
+        }
+    }
+  else if (diffexp < -56 && xneg)
+    { /* abs(x)/abs(y) > 2^56 and x < 0 => arctan(y/x)
+         is insignificant compared to pi */
+    if (yneg) return val_with_flags(-pi,AMD_F_INEXACT);
+    else return val_with_flags(pi,AMD_F_INEXACT);
+    }
+  else if (yinf && xinf)
+    { /* If abs(x) and abs(y) are both infinity
+         return +-pi/4 or +- 3pi/4 according to
+         signs.  */
+    if (xneg)
+      {
+      if (yneg) return val_with_flags(-three_piby4,AMD_F_INEXACT);
+      else return val_with_flags(three_piby4,AMD_F_INEXACT);
+      }
+    else
+      {
+      if (yneg) return val_with_flags(-piby4,AMD_F_INEXACT);
+      else return val_with_flags(piby4,AMD_F_INEXACT);
+      }
+    }
+
+  /* General case: take absolute values of arguments */
+
+  u = x; v = y;
+  if (xneg) u = -x;
+  if (yneg) v = -y;
+
+  /* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
+
+  swap_vu = (u < v);
+  if (swap_vu) { uu = u; u = v; v = uu; }
+  vbyu = v/u;
+
+  if (vbyu > 0.0625)
+    { /* General values of v/u. Use a look-up
+         table and series expansion. */
+
+      index = (int)(256*vbyu + 0.5);
+      q1 = atan_jby256_lead[index-16];
+      q2 = atan_jby256_tail[index-16];
+      c = index*1./256;
+      GET_BITS_DP64(u, ui);
+      m = (int)((ui & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+      u = scaleDouble_2(u,-m);
+      v = scaleDouble_2(v,-m);
+      GET_BITS_DP64(u, ui);
+      PUT_BITS_DP64(0xfffffffff8000000 & ui, u1); /* 26 leading bits of u */
+      u2 = u - u1;
+
+      r = ((v-c*u1)-c*u2)/(u+c*v);
+
+      /* Polynomial approximation to atan(r) */
+
+      s = r*r;
+      q2 = q2 + r - r*(s * (0.33333333333224095522 - s*(0.19999918038989143496)));
+    }
+  else if (vbyu < 1.e-8)
+    { /* v/u is small enough that atan(v/u) = v/u */
+      q1 = 0.0;
+      q2 = vbyu;
+    }
+  else  /* vbyu <= 0.0625 */
+    {
+      /* Small values of v/u. Use a series expansion
+	 computed carefully to minimise cancellation */
+
+      GET_BITS_DP64(u, ui);
+      PUT_BITS_DP64(0xffffffff00000000 & ui, u1);
+      GET_BITS_DP64(vbyu, ui);
+      PUT_BITS_DP64(0xffffffff00000000 & ui, vu1);
+      u2 = u - u1;
+      vu2 = vbyu - vu1;
+
+      q1 = 0.0;
+      s  = vbyu*vbyu;
+      q2 = vbyu +
+	((((v - u1*vu1) - u2*vu1) - u*vu2)/u -
+	 (vbyu*s*(0.33333333333333170500 -
+		  s*(0.19999999999393223405 -
+		     s*(0.14285713561807169030 -
+			s*(0.11110736283514525407 -
+			   s*(0.90029810285449784439E-01)))))));
+    }
+
+  /* Tidy-up according to which quadrant the arguments lie in */
+
+  if (swap_vu) {q1 = piby2_head - q1; q2 = piby2_tail - q2;}
+  if (xneg) {q1 = pi_head - q1; q2 = pi_tail - q2;}
+  q1 = q1 + q2;
+
+  if (yneg) q1 = - q1;
+
+  return q1;
+}
+
+weak_alias (__atan2, atan2)
diff --git a/src/atan2f.c b/src/atan2f.c
new file mode 100644
index 0000000..9b53c6f
--- /dev/null
+++ b/src/atan2f.c
@@ -0,0 +1,500 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VALF_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_SCALEDOUBLE_1
+#define USE_SCALEDOWNDOUBLE
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_VALF_WITH_FLAGS
+#undef USE_NAN_WITH_FLAGS
+#undef USE_SCALEDOUBLE_1
+#undef USE_SCALEDOWNDOUBLE
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range arguments
+   (only used when _LIB_VERSION is _SVID_) */
+static inline float retval_errno_edom(float x, float y)
+{
+  /* Hand the domain error to matherr(); retval starts out as HUGE. */
+  struct exception report;
+  report.name = (char *)"atan2f";
+  report.type = DOMAIN;
+  report.arg1 = (double)x;
+  report.arg2 = (double)y;
+  report.retval = HUGE;
+  if (matherr(&report))
+    return report.retval;
+  (void)fputs("atan2f: DOMAIN error\n", stderr);
+  __set_errno(EDOM);
+  return report.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(atan2f)
+#endif
+
+float FN_PROTOTYPE(atan2f)(float fy, float fx)
+{
+  /* Single-precision atan2(fy, fx): the angle of the point (fx, fy),
+     evaluated in double and rounded to float on return.  atan_jby256
+     holds precomputed values of atan(j/256) for j = 16, 17, ..., 256. */
+  static const double atan_jby256[  241] = {
+    6.24188099959573430842e-02,  /* 0x3faff55bb72cfde9 */
+    6.63088949198234745008e-02,  /* 0x3fb0f99ea71d52a6 */
+    7.01969710718705064423e-02,  /* 0x3fb1f86dbf082d58 */
+    7.40829225490337306415e-02,  /* 0x3fb2f719318a4a9a */
+    7.79666338315423007588e-02,  /* 0x3fb3f59f0e7c559d */
+    8.18479898030765457007e-02,  /* 0x3fb4f3fd677292fb */
+    8.57268757707448092464e-02,  /* 0x3fb5f2324fd2d7b2 */
+    8.96031774848717321724e-02,  /* 0x3fb6f03bdcea4b0c */
+    9.34767811585894559112e-02,  /* 0x3fb7ee182602f10e */
+    9.73475734872236708739e-02,  /* 0x3fb8ebc54478fb28 */
+    1.01215441667466668485e-01,  /* 0x3fb9e94153cfdcf1 */
+    1.05080273416329528224e-01,  /* 0x3fbae68a71c722b8 */
+    1.08941956989865793015e-01,  /* 0x3fbbe39ebe6f07c3 */
+    1.12800381201659388752e-01,  /* 0x3fbce07c5c3cca32 */
+    1.16655435441069349478e-01,  /* 0x3fbddd21701eba6e */
+    1.20507009691224548087e-01,  /* 0x3fbed98c2190043a */
+    1.24354994546761424279e-01,  /* 0x3fbfd5ba9aac2f6d */
+    1.28199281231298117811e-01,  /* 0x3fc068d584212b3d */
+    1.32039761614638734288e-01,  /* 0x3fc0e6adccf40881 */
+    1.35876328229701304195e-01,  /* 0x3fc1646541060850 */
+    1.39708874289163620386e-01,  /* 0x3fc1e1fafb043726 */
+    1.43537293701821222491e-01,  /* 0x3fc25f6e171a535c */
+    1.47361481088651630200e-01,  /* 0x3fc2dcbdb2fba1ff */
+    1.51181331798580037562e-01,  /* 0x3fc359e8edeb99a3 */
+    1.54996741923940972718e-01,  /* 0x3fc3d6eee8c6626c */
+    1.58807608315631065832e-01,  /* 0x3fc453cec6092a9e */
+    1.62613828597948567589e-01,  /* 0x3fc4d087a9da4f17 */
+    1.66415301183114927586e-01,  /* 0x3fc54d18ba11570a */
+    1.70211925285474380276e-01,  /* 0x3fc5c9811e3ec269 */
+    1.74003600935367680469e-01,  /* 0x3fc645bfffb3aa73 */
+    1.77790228992676047071e-01,  /* 0x3fc6c1d4898933d8 */
+    1.81571711160032150945e-01,  /* 0x3fc73dbde8a7d201 */
+    1.85347949995694760705e-01,  /* 0x3fc7b97b4bce5b02 */
+    1.89118848926083965578e-01,  /* 0x3fc8350be398ebc7 */
+    1.92884312257974643856e-01,  /* 0x3fc8b06ee2879c28 */
+    1.96644245190344985064e-01,  /* 0x3fc92ba37d050271 */
+    2.00398553825878511514e-01,  /* 0x3fc9a6a8e96c8626 */
+    2.04147145182116990236e-01,  /* 0x3fca217e601081a5 */
+    2.07889927202262986272e-01,  /* 0x3fca9c231b403279 */
+    2.11626808765629753628e-01,  /* 0x3fcb1696574d780b */
+    2.15357699697738047551e-01,  /* 0x3fcb90d7529260a2 */
+    2.19082510780057748701e-01,  /* 0x3fcc0ae54d768466 */
+    2.22801153759394493514e-01,  /* 0x3fcc84bf8a742e6d */
+    2.26513541356919617664e-01,  /* 0x3fccfe654e1d5395 */
+    2.30219587276843717927e-01,  /* 0x3fcd77d5df205736 */
+    2.33919206214733416127e-01,  /* 0x3fcdf110864c9d9d */
+    2.37612313865471241892e-01,  /* 0x3fce6a148e96ec4d */
+    2.41298826930858800743e-01,  /* 0x3fcee2e1451d980c */
+    2.44978663126864143473e-01,  /* 0x3fcf5b75f92c80dd */
+    2.48651741190513253521e-01,  /* 0x3fcfd3d1fc40dbe4 */
+    2.52317980886427151166e-01,  /* 0x3fd025fa510665b5 */
+    2.55977303013005474952e-01,  /* 0x3fd061eea03d6290 */
+    2.59629629408257511791e-01,  /* 0x3fd09dc597d86362 */
+    2.63274882955282396590e-01,  /* 0x3fd0d97ee509acb3 */
+    2.66912987587400396539e-01,  /* 0x3fd1151a362431c9 */
+    2.70543868292936529052e-01,  /* 0x3fd150973a9ce546 */
+    2.74167451119658789338e-01,  /* 0x3fd18bf5a30bf178 */
+    2.77783663178873208022e-01,  /* 0x3fd1c735212dd883 */
+    2.81392432649178403370e-01,  /* 0x3fd2025567e47c95 */
+    2.84993688779881237938e-01,  /* 0x3fd23d562b381041 */
+    2.88587361894077354396e-01,  /* 0x3fd278372057ef45 */
+    2.92173383391398755471e-01,  /* 0x3fd2b2f7fd9b5fe2 */
+    2.95751685750431536626e-01,  /* 0x3fd2ed987a823cfe */
+    2.99322202530807379706e-01,  /* 0x3fd328184fb58951 */
+    3.02884868374971361060e-01,  /* 0x3fd362773707ebcb */
+    3.06439619009630070945e-01,  /* 0x3fd39cb4eb76157b */
+    3.09986391246883430384e-01,  /* 0x3fd3d6d129271134 */
+    3.13525122985043869228e-01,  /* 0x3fd410cbad6c7d32 */
+    3.17055753209146973237e-01,  /* 0x3fd44aa436c2af09 */
+    3.20578221991156986359e-01,  /* 0x3fd4845a84d0c21b */
+    3.24092470489871664618e-01,  /* 0x3fd4bdee586890e6 */
+    3.27598440950530811477e-01,  /* 0x3fd4f75f73869978 */
+    3.31096076704132047386e-01,  /* 0x3fd530ad9951cd49 */
+    3.34585322166458920545e-01,  /* 0x3fd569d88e1b4cd7 */
+    3.38066122836825466713e-01,  /* 0x3fd5a2e0175e0f4e */
+    3.41538425296541714449e-01,  /* 0x3fd5dbc3fbbe768d */
+    3.45002177207105076295e-01,  /* 0x3fd614840309cfe1 */
+    3.48457327308122011278e-01,  /* 0x3fd64d1ff635c1c5 */
+    3.51903825414964732676e-01,  /* 0x3fd685979f5fa6fd */
+    3.55341622416168290144e-01,  /* 0x3fd6bdeac9cbd76c */
+    3.58770670270572189509e-01,  /* 0x3fd6f61941e4def0 */
+    3.62190922004212156882e-01,  /* 0x3fd72e22d53aa2a9 */
+    3.65602331706966821034e-01,  /* 0x3fd7660752817501 */
+    3.69004854528964421068e-01,  /* 0x3fd79dc6899118d1 */
+    3.72398446676754202311e-01,  /* 0x3fd7d5604b63b3f7 */
+    3.75783065409248884237e-01,  /* 0x3fd80cd46a14b1d0 */
+    3.79158669033441808605e-01,  /* 0x3fd84422b8df95d7 */
+    3.82525216899905096124e-01,  /* 0x3fd87b4b0c1ebedb */
+    3.85882669398073752109e-01,  /* 0x3fd8b24d394a1b25 */
+    3.89230987951320717144e-01,  /* 0x3fd8e92916f5cde8 */
+    3.92570135011828580396e-01,  /* 0x3fd91fde7cd0c662 */
+    3.95900074055262896078e-01,  /* 0x3fd9566d43a34907 */
+    3.99220769575252543149e-01,  /* 0x3fd98cd5454d6b18 */
+    4.02532187077682512832e-01,  /* 0x3fd9c3165cc58107 */
+    4.05834293074804064450e-01,  /* 0x3fd9f93066168001 */
+    4.09127055079168300278e-01,  /* 0x3fda2f233e5e530b */
+    4.12410441597387267265e-01,  /* 0x3fda64eec3cc23fc */
+    4.15684422123729413467e-01,  /* 0x3fda9a92d59e98cf */
+    4.18948967133552840902e-01,  /* 0x3fdad00f5422058b */
+    4.22204048076583571270e-01,  /* 0x3fdb056420ae9343 */
+    4.25449637370042266227e-01,  /* 0x3fdb3a911da65c6c */
+    4.28685708391625730496e-01,  /* 0x3fdb6f962e737efb */
+    4.31912235472348193799e-01,  /* 0x3fdba473378624a5 */
+    4.35129193889246812521e-01,  /* 0x3fdbd9281e528191 */
+    4.38336559857957774877e-01,  /* 0x3fdc0db4c94ec9ef */
+    4.41534310525166673322e-01,  /* 0x3fdc42191ff11eb6 */
+    4.44722423960939305942e-01,  /* 0x3fdc76550aad71f8 */
+    4.47900879150937292206e-01,  /* 0x3fdcaa6872f3631b */
+    4.51069655988523443568e-01,  /* 0x3fdcde53432c1350 */
+    4.54228735266762495559e-01,  /* 0x3fdd121566b7f2ad */
+    4.57378098670320809571e-01,  /* 0x3fdd45aec9ec862b */
+    4.60517728767271039558e-01,  /* 0x3fdd791f5a1226f4 */
+    4.63647609000806093515e-01,  /* 0x3fddac670561bb4f */
+    4.66767723680866497560e-01,  /* 0x3fdddf85bb026974 */
+    4.69878057975686880265e-01,  /* 0x3fde127b6b0744af */
+    4.72978597903265574054e-01,  /* 0x3fde4548066cf51a */
+    4.76069330322761219421e-01,  /* 0x3fde77eb7f175a34 */
+    4.79150242925822533735e-01,  /* 0x3fdeaa65c7cf28c4 */
+    4.82221324227853687105e-01,  /* 0x3fdedcb6d43f8434 */
+    4.85282563559221225002e-01,  /* 0x3fdf0ede98f393cf */
+    4.88333951056405479729e-01,  /* 0x3fdf40dd0b541417 */
+    4.91375477653101910835e-01,  /* 0x3fdf72b221a4e495 */
+    4.94407135071275316562e-01,  /* 0x3fdfa45dd3029258 */
+    4.97428915812172245392e-01,  /* 0x3fdfd5e0175fdf83 */
+    5.00440813147294050189e-01,  /* 0x3fe0039c73c1a40b */
+    5.03442821109336358099e-01,  /* 0x3fe01c341e82422d */
+    5.06434934483096732549e-01,  /* 0x3fe034b709250488 */
+    5.09417148796356245022e-01,  /* 0x3fe04d25314342e5 */
+    5.12389460310737621107e-01,  /* 0x3fe0657e94db30cf */
+    5.15351866012543347040e-01,  /* 0x3fe07dc3324e9b38 */
+    5.18304363603577900044e-01,  /* 0x3fe095f30861a58f */
+    5.21246951491958210312e-01,  /* 0x3fe0ae0e1639866c */
+    5.24179628782913242802e-01,  /* 0x3fe0c6145b5b43da */
+    5.27102395269579471204e-01,  /* 0x3fe0de05d7aa6f7c */
+    5.30015251423793132268e-01,  /* 0x3fe0f5e28b67e295 */
+    5.32918198386882147055e-01,  /* 0x3fe10daa77307a0d */
+    5.35811237960463593311e-01,  /* 0x3fe1255d9bfbd2a8 */
+    5.38694372597246617929e-01,  /* 0x3fe13cfbfb1b056e */
+    5.41567605391844897333e-01,  /* 0x3fe1548596376469 */
+    5.44430940071603086672e-01,  /* 0x3fe16bfa6f5137e1 */
+    5.47284380987436924748e-01,  /* 0x3fe1835a88be7c13 */
+    5.50127933104692989907e-01,  /* 0x3fe19aa5e5299f99 */
+    5.52961601994028217888e-01,  /* 0x3fe1b1dc87904284 */
+    5.55785393822313511514e-01,  /* 0x3fe1c8fe7341f64f */
+    5.58599315343562330405e-01,  /* 0x3fe1e00babdefeb3 */
+    5.61403373889889367732e-01,  /* 0x3fe1f7043557138a */
+    5.64197577362497537656e-01,  /* 0x3fe20de813e823b1 */
+    5.66981934222700489912e-01,  /* 0x3fe224b74c1d192a */
+    5.69756453482978431069e-01,  /* 0x3fe23b71e2cc9e6a */
+    5.72521144698072359525e-01,  /* 0x3fe25217dd17e501 */
+    5.75276017956117824426e-01,  /* 0x3fe268a940696da6 */
+    5.78021083869819540801e-01,  /* 0x3fe27f261273d1b3 */
+    5.80756353567670302596e-01,  /* 0x3fe2958e59308e30 */
+    5.83481838685214859730e-01,  /* 0x3fe2abe21aded073 */
+    5.86197551356360535557e-01,  /* 0x3fe2c2215e024465 */
+    5.88903504204738026395e-01,  /* 0x3fe2d84c2961e48b */
+    5.91599710335111383941e-01,  /* 0x3fe2ee628406cbca */
+    5.94286183324841177367e-01,  /* 0x3fe30464753b090a */
+    5.96962937215401501234e-01,  /* 0x3fe31a52048874be */
+    5.99629986503951384336e-01,  /* 0x3fe3302b39b78856 */
+    6.02287346134964152178e-01,  /* 0x3fe345f01cce37bb */
+    6.04935031491913965951e-01,  /* 0x3fe35ba0b60eccce */
+    6.07573058389022313541e-01,  /* 0x3fe3713d0df6c503 */
+    6.10201443063065118722e-01,  /* 0x3fe386c52d3db11e */
+    6.12820202165241245673e-01,  /* 0x3fe39c391cd41719 */
+    6.15429352753104952356e-01,  /* 0x3fe3b198e5e2564a */
+    6.18028912282561737612e-01,  /* 0x3fe3c6e491c78dc4 */
+    6.20618898599929469384e-01,  /* 0x3fe3dc1c2a188504 */
+    6.23199329934065904268e-01,  /* 0x3fe3f13fb89e96f4 */
+    6.25770224888563042498e-01,  /* 0x3fe4064f47569f48 */
+    6.28331602434009650615e-01,  /* 0x3fe41b4ae06fea41 */
+    6.30883481900321840818e-01,  /* 0x3fe430328e4b26d5 */
+    6.33425882969144482537e-01,  /* 0x3fe445065b795b55 */
+    6.35958825666321447834e-01,  /* 0x3fe459c652badc7f */
+    6.38482330354437466191e-01,  /* 0x3fe46e727efe4715 */
+    6.40996417725432032775e-01,  /* 0x3fe4830aeb5f7bfd */
+    6.43501108793284370968e-01,  /* 0x3fe4978fa3269ee1 */
+    6.45996424886771558604e-01,  /* 0x3fe4ac00b1c71762 */
+    6.48482387642300484032e-01,  /* 0x3fe4c05e22de94e4 */
+    6.50959018996812410762e-01,  /* 0x3fe4d4a8023414e8 */
+    6.53426341180761927063e-01,  /* 0x3fe4e8de5bb6ec04 */
+    6.55884376711170835605e-01,  /* 0x3fe4fd013b7dd17e */
+    6.58333148384755983962e-01,  /* 0x3fe51110adc5ed81 */
+    6.60772679271132590273e-01,  /* 0x3fe5250cbef1e9fa */
+    6.63202992706093175102e-01,  /* 0x3fe538f57b89061e */
+    6.65624112284960989250e-01,  /* 0x3fe54ccaf0362c8f */
+    6.68036061856020157990e-01,  /* 0x3fe5608d29c70c34 */
+    6.70438865514021320458e-01,  /* 0x3fe5743c352b33b9 */
+    6.72832547593763097282e-01,  /* 0x3fe587d81f732fba */
+    6.75217132663749830535e-01,  /* 0x3fe59b60f5cfab9d */
+    6.77592645519925151909e-01,  /* 0x3fe5aed6c5909517 */
+    6.79959111179481823228e-01,  /* 0x3fe5c2399c244260 */
+    6.82316554874748071313e-01,  /* 0x3fe5d58987169b18 */
+    6.84665002047148862907e-01,  /* 0x3fe5e8c6941043cf */
+    6.87004478341244895212e-01,  /* 0x3fe5fbf0d0d5cc49 */
+    6.89335009598845749323e-01,  /* 0x3fe60f084b46e05e */
+    6.91656621853199760075e-01,  /* 0x3fe6220d115d7b8d */
+    6.93969341323259825138e-01,  /* 0x3fe634ff312d1f3b */
+    6.96273194408023488045e-01,  /* 0x3fe647deb8e20b8f */
+    6.98568207680949848637e-01,  /* 0x3fe65aabb6c07b02 */
+    7.00854407884450081312e-01,  /* 0x3fe66d663923e086 */
+    7.03131821924453670469e-01,  /* 0x3fe6800e4e7e2857 */
+    7.05400476865049030906e-01,  /* 0x3fe692a40556fb6a */
+    7.07660399923197958039e-01,  /* 0x3fe6a5276c4b0575 */
+    7.09911618463524796141e-01,  /* 0x3fe6b798920b3d98 */
+    7.12154159993178659249e-01,  /* 0x3fe6c9f7855c3198 */
+    7.14388052156768926793e-01,  /* 0x3fe6dc44551553ae */
+    7.16613322731374569052e-01,  /* 0x3fe6ee7f10204aef */
+    7.18829999621624415873e-01,  /* 0x3fe700a7c5784633 */
+    7.21038110854851588272e-01,  /* 0x3fe712be84295198 */
+    7.23237684576317874097e-01,  /* 0x3fe724c35b4fae7b */
+    7.25428749044510712274e-01,  /* 0x3fe736b65a172dff */
+    7.27611332626510676214e-01,  /* 0x3fe748978fba8e0f */
+    7.29785463793429123314e-01,  /* 0x3fe75a670b82d8d8 */
+    7.31951171115916565668e-01,  /* 0x3fe76c24dcc6c6c0 */
+    7.34108483259739652560e-01,  /* 0x3fe77dd112ea22c7 */
+    7.36257428981428097003e-01,  /* 0x3fe78f6bbd5d315e */
+    7.38398037123989547936e-01,  /* 0x3fe7a0f4eb9c19a2 */
+    7.40530336612692630105e-01,  /* 0x3fe7b26cad2e50fd */
+    7.42654356450917929600e-01,  /* 0x3fe7c3d311a6092b */
+    7.44770125716075148681e-01,  /* 0x3fe7d528289fa093 */
+    7.46877673555587429099e-01,  /* 0x3fe7e66c01c114fd */
+    7.48977029182941400620e-01,  /* 0x3fe7f79eacb97898 */
+    7.51068221873802288613e-01,  /* 0x3fe808c03940694a */
+    7.53151280962194302759e-01,  /* 0x3fe819d0b7158a4c */
+    7.55226235836744863583e-01,  /* 0x3fe82ad036000005 */
+    7.57293115936992444759e-01,  /* 0x3fe83bbec5cdee22 */
+    7.59351950749757920178e-01,  /* 0x3fe84c9c7653f7ea */
+    7.61402769805578416573e-01,  /* 0x3fe85d69576cc2c5 */
+    7.63445602675201784315e-01,  /* 0x3fe86e2578f87ae5 */
+    7.65480478966144461950e-01,  /* 0x3fe87ed0eadc5a2a */
+    7.67507428319308182552e-01,  /* 0x3fe88f6bbd023118 */
+    7.69526480405658186434e-01,  /* 0x3fe89ff5ff57f1f7 */
+    7.71537664922959498526e-01,  /* 0x3fe8b06fc1cf3dfe */
+    7.73541011592573490852e-01,  /* 0x3fe8c0d9145cf49d */
+    7.75536550156311621507e-01,  /* 0x3fe8d13206f8c4ca */
+    7.77524310373347682379e-01,  /* 0x3fe8e17aa99cc05d */
+    7.79504322017186335181e-01,  /* 0x3fe8f1b30c44f167 */
+    7.81476614872688268854e-01,  /* 0x3fe901db3eeef187 */
+    7.83441218733151756304e-01,  /* 0x3fe911f35199833b */
+    7.85398163397448278999e-01}; /* 0x3fe921fb54442d18 */
+
+  /* Some constants. */
+
+  static double pi = 3.1415926535897932e+00, /* 0x400921fb54442d18 */
+             piby2 = 1.5707963267948966e+00, /* 0x3ff921fb54442d18 */
+             piby4 = 7.8539816339744831e-01, /* 0x3fe921fb54442d18 */
+       three_piby4 = 2.3561944901923449e+00; /* 0x4002d97c7f3321d2 */
+
+  double u, v, vbyu, q, s, uu, r;
+  unsigned int swap_vu, index, xzero, yzero, xnan, ynan, xinf, yinf;
+  int xexp, yexp, diffexp;
+
+  double x = fx;
+  double y = fy;
+
+  /* Find properties of arguments x and y. */
+
+  unsigned long long ux, aux, xneg, uy, auy, yneg;
+
+  GET_BITS_DP64(x, ux);
+  GET_BITS_DP64(y, uy);
+  aux = ux & ~SIGNBIT_DP64;
+  auy = uy & ~SIGNBIT_DP64;
+  xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  yexp = (int)((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+  xneg = ux & SIGNBIT_DP64;
+  yneg = uy & SIGNBIT_DP64;
+  xzero = (aux == 0);
+  yzero = (auy == 0);
+  xnan = (aux > PINFBITPATT_DP64);
+  ynan = (auy > PINFBITPATT_DP64);
+  xinf = (aux == PINFBITPATT_DP64);
+  yinf = (auy == PINFBITPATT_DP64);
+
+  diffexp = yexp - xexp;
+
+  /* Special cases */
+
+  if (xnan)
+#ifdef WINDOWS
+    {
+      unsigned int ufx;
+      GET_BITS_SP32(fx, ufx);
+      return handle_errorf("atan2f", ufx|0x00400000, _DOMAIN, 0, EDOM, fx, fy);
+    }
+#else
+    return fx + fx; /* Raise invalid if it's a signalling NaN */
+#endif
+  else if (ynan)
+#ifdef WINDOWS
+    {
+      unsigned int ufy;
+      GET_BITS_SP32(fy, ufy);
+      return handle_errorf("atan2f", ufy|0x00400000, _DOMAIN, 0, EDOM, fx, fy);
+    }
+#else
+    return (float)(y + y); /* Raise invalid if it's a signalling NaN */
+#endif
+  else if (yzero)
+    { /* Zero y gives +-0 for positive x
+         and +-pi for negative x */
+#ifndef WINDOWS
+      if ((_LIB_VERSION == _SVID_) && xzero)
+        /* Sigh - _SVID_ defines atan2(0,0) as a domain error */
+        return retval_errno_edom(x, y);
+      else
+#endif
+      if (xneg)
+        {
+          if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
+          else return valf_with_flags((float)pi, AMD_F_INEXACT);
+        }
+      else return (float)y;
+    }
+  else if (xzero)
+    { /* Zero x (y is nonzero here) gives +- pi/2
+         depending on sign of y */
+      if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
+      else return valf_with_flags((float)piby2, AMD_F_INEXACT);
+    }
+
+  if (diffexp > 26)
+    { /* abs(y)/abs(x) > 2^26 => arctan(x/y)
+         is insignificant compared to piby2 */
+      if (yneg) return valf_with_flags((float)-piby2, AMD_F_INEXACT);
+      else return valf_with_flags((float)piby2, AMD_F_INEXACT);
+    }
+  else if (diffexp < -13 && (!xneg))
+    { /* x positive and dominant over y by a factor of 2^13.
+         In this case atan(y/x) is y/x to machine accuracy. */
+
+      if (diffexp < -150) /* Result underflows */
+        {
+          if (yneg)
+            return valf_with_flags(-0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+          else
+            return valf_with_flags(0.0F, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+        }
+      else
+        {
+          if (diffexp < -126)
+            {
+              /* Result will likely be denormalized */
+              y = scaleDouble_1(y, 100);
+              y /= x;
+              /* Now y is 2^100 times the true result. Scale it back down. */
+              GET_BITS_DP64(y, uy);
+              scaleDownDouble(uy, 100, &uy);
+              PUT_BITS_DP64(uy, y);
+              if ((uy & EXPBITS_DP64) == 0)
+                return valf_with_flags((float)y, AMD_F_INEXACT | AMD_F_UNDERFLOW);
+              else
+                return (float)y;
+            }
+          else
+            return (float)(y / x);
+        }
+    }
+  else if (diffexp < -26 && xneg)
+    { /* abs(x)/abs(y) > 2^26 and x < 0 => arctan(y/x)
+         is insignificant compared to pi */
+    if (yneg) return valf_with_flags((float)-pi, AMD_F_INEXACT);
+    else return valf_with_flags((float)pi, AMD_F_INEXACT);
+    }
+  else if (yinf && xinf)
+    { /* If abs(x) and abs(y) are both infinity
+         return +-pi/4 or +- 3pi/4 according to
+         signs.  */
+    if (xneg)
+      {
+      if (yneg) return valf_with_flags((float)-three_piby4, AMD_F_INEXACT);
+      else return valf_with_flags((float)three_piby4, AMD_F_INEXACT);
+      }
+    else
+      {
+      if (yneg) return valf_with_flags((float)-piby4, AMD_F_INEXACT);
+      else return valf_with_flags((float)piby4, AMD_F_INEXACT);
+      }
+    }
+
+  /* General case: take absolute values of arguments */
+
+  u = x; v = y;
+  if (xneg) u = -x;
+  if (yneg) v = -y;
+
+  /* Swap u and v if necessary to obtain 0 < v < u. Compute v/u. */
+
+  swap_vu = (u < v);
+  if (swap_vu) { uu = u; u = v; v = uu; }
+  vbyu = v/u;
+
+  if (vbyu > 0.0625)
+    { /* General values of v/u. Use a look-up
+         table and series expansion. */
+
+      index = (int)(256*vbyu + 0.5);
+      r = (256*v-index*u)/(256*u+index*v);
+
+      /* atan(vbyu) = atan(index/256) + atan(r), with atan(r) ~ r - r^3/3 */
+
+      s = r*r;
+      q = atan_jby256[index-16] + r - r*s*0.33333333333224095522;
+    }
+  else if (vbyu < 1.e-4)
+    { /* v/u is small enough that atan(v/u) = v/u */
+      q = vbyu;
+    }
+  else /* vbyu <= 0.0625 */
+    {
+      /* Small values of v/u. Use a series expansion */
+
+      s  = vbyu*vbyu;
+      q = vbyu -
+        vbyu*s*(0.33333333333333170500 -
+                s*(0.19999999999393223405 -
+                   s*0.14285713561807169030));
+    }
+
+  /* Tidy-up according to which quadrant the arguments lie in */
+
+  if (swap_vu) {q = piby2 - q;}
+  if (xneg) {q = pi - q;}
+  if (yneg) q = - q;
+  return (float)q;
+}
+
+weak_alias (__atan2f, atan2f)
diff --git a/src/atanf.c b/src/atanf.c
new file mode 100644
index 0000000..567dd87
--- /dev/null
+++ b/src/atanf.c
@@ -0,0 +1,170 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_VALF_WITH_FLAGS
+#define USE_NAN_WITH_FLAGS
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_VALF_WITH_FLAGS
+#undef USE_NAN_WITH_FLAGS
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range argument */
+static inline float retval_errno_edom(float x)
+{
+  /* POSIX mode only sets errno; any other mode lets matherr() handle
+     the error first and sets errno only if matherr() returns zero. */
+  struct exception report;
+  report.type = DOMAIN;
+  report.name = (char *)"atanf";
+  report.arg1 = (float)x;
+  report.arg2 = (float)x;
+  report.retval = (_LIB_VERSION == _SVID_)
+    ? HUGE : nan_with_flags(AMD_F_INVALID);
+  if (_LIB_VERSION != _POSIX_)
+    {
+      if (matherr(&report))
+        return report.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("atanf: DOMAIN error\n", stderr);
+    }
+  __set_errno(EDOM);
+  return report.retval;
+}
+#endif
+
+#ifdef WINDOWS
+#pragma function(atanf)
+#endif
+
+float FN_PROTOTYPE(atanf)(float fx)
+{
+  /* Single-precision arctangent.  The argument is widened to double,
+     its magnitude is reduced about one of the breakpoints 0, 0.5, 1,
+     1.5 or infinity, and a rational approximation handles the
+     remainder.  The sign of the argument is re-applied at the end. */
+
+  static double piby2 = 1.5707963267948966e+00; /* 0x3ff921fb54442d18 */
+
+  double offset;   /* atan of the chosen breakpoint */
+  double mag;      /* |x| */
+  double red;      /* reduced argument */
+  double red2;     /* red squared */
+  double corr;     /* red - atan(red), as approximated below */
+  double result;
+  unsigned int is_nan;
+  unsigned long long bits, absbits, signbit;
+
+  double x = fx;
+
+  /* Split the bit pattern into magnitude and sign. */
+
+  GET_BITS_DP64(x, bits);
+  absbits = bits & ~SIGNBIT_DP64;
+  signbit = bits & SIGNBIT_DP64;
+
+  mag = signbit ? -x : x;
+
+  /* Pick the reduction interval by comparing magnitude bits. */
+
+  if (absbits < 0x3ec0000000000000) /* |x| < 2^-19: atan(x) rounds to x */
+    {
+      if (absbits != 0x0000000000000000)
+        return valf_with_flags(fx, AMD_F_INEXACT);
+      return fx; /* a zero goes straight back with no flags requested */
+    }
+  else if (absbits < 0x3fdc000000000000) /* |x| < 7/16: no reduction */
+    {
+      red = mag;
+      offset = 0.0;
+    }
+  else if (absbits < 0x3fe6000000000000) /* |x| < 11/16: reduce about 0.5 */
+    {
+      red = (2.0*mag-1.0)/(2.0+mag);
+      offset = 4.63647609000806093515e-01; /* atan(0.5), 0x3fddac670561bb4f */
+    }
+  else if (absbits < 0x3ff3000000000000) /* |x| < 19/16: reduce about 1 */
+    {
+      red = (mag-1.0)/(1.0+mag);
+      offset = 7.85398163397448278999e-01; /* atan(1), 0x3fe921fb54442d18 */
+    }
+  else if (absbits < 0x4003800000000000) /* |x| < 39/16: reduce about 1.5 */
+    {
+      red = (mag-1.5)/(1.0+1.5*mag);
+      offset = 9.82793723247329054082e-01; /* atan(1.5), 0x3fef730bd281f69b */
+    }
+  else
+    {
+      is_nan = (absbits > PINFBITPATT_DP64);
+
+      if (is_nan)
+        {
+#ifdef WINDOWS
+          unsigned int uhx;
+          GET_BITS_SP32(fx, uhx);
+          return handle_errorf("atanf", uhx|0x00400000, _DOMAIN,
+                               0, EDOM, fx, 0.0F);
+#else
+          return x + x; /* NaN result; a signalling NaN raises invalid */
+#endif
+        }
+
+      if (absbits > 0x4190000000000000)
+        { /* |x| > 2^26: atan(1/x) is negligible next to pi/2 */
+          if (signbit)
+            return valf_with_flags((float)-piby2, AMD_F_INEXACT);
+          return valf_with_flags((float)piby2, AMD_F_INEXACT);
+        }
+
+      red = -1.0/mag;
+      offset = 1.57079632679489655800e+00; /* atan(inf), 0x3ff921fb54442d18 */
+    }
+
+  /* Core approximation: Remez(2,2) rational on [-7/16,7/16]. */
+
+  red2 = red*red;
+  corr = red*red2*
+    (0.296528598819239217902158651186e0 +
+     (0.192324546402108583211697690500e0 +
+       0.470677934286149214138357545549e-2*red2)*red2)/
+    (0.889585796862432286486651434570e0 +
+     (0.111072499995399550138837673349e1 +
+       0.299309699959659728404442796915e0*red2)*red2);
+
+  /* atan(|x|) = offset + atan(red), and atan(red) is taken as
+     red - corr. */
+  result = offset - (corr - red);
+
+  /* Re-apply the sign of the original argument. */
+
+  if (signbit) result = -result;
+  return (float)result;
+}
+
+weak_alias (__atanf, atanf)
diff --git a/src/atanh.c b/src/atanh.c
new file mode 100644
index 0000000..5815ced
--- /dev/null
+++ b/src/atanh.c
@@ -0,0 +1,193 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_NAN_WITH_FLAGS
+#define USE_VAL_WITH_FLAGS
+#define USE_INFINITY_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NAN_WITH_FLAGS
+#undef USE_VAL_WITH_FLAGS
+#undef USE_INFINITY_WITH_FLAGS
+#undef USE_HANDLE_ERROR
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range argument */
+static inline double retval_errno_edom(double x, double retval)
+{
+  struct exception err;
+  err.name = (char *)"atanh";
+  err.type = DOMAIN;
+  err.arg1 = x;
+  err.arg2 = x;
+  /* SVID mode mandates -HUGE; otherwise hand back the caller's value. */
+  err.retval = (_LIB_VERSION == _SVID_) ? -HUGE : retval;
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, errno is left untouched. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("atanh: DOMAIN error\n", stderr);
+    }
+  /* POSIX mode, or matherr() returned zero: report through errno. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#undef _FUNCNAME
+#define _FUNCNAME "atanh"
+double FN_PROTOTYPE(atanh)(double x)
+{
+  /* Inverse hyperbolic tangent of x, dispatched on the bit pattern of x. */
+  unsigned long long ux, ax;
+  double r, absx, t, poly;
+
+  /* ux = raw bits of x, ax = bits of |x|, absx = |x| as a double */
+  GET_BITS_DP64(x, ux);
+  ax = ux & ~SIGNBIT_DP64;
+  PUT_BITS_DP64(ax, absx);
+
+  if ((ux & EXPBITS_DP64) == EXPBITS_DP64)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_DP64)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, ux|0x0008000000000000, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity; return a NaN */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return retval_errno_edom(x,nan_with_flags(AMD_F_INVALID));
+#endif
+        }
+    }
+  else if (ax >= 0x3ff0000000000000)
+    {
+      if (ax > 0x3ff0000000000000)
+        {
+          /* abs(x) > 1.0; return NaN */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, INDEFBITPATT_DP64, _DOMAIN,
+                              AMD_F_INVALID, EDOM, x, 0.0);
+#else
+          return retval_errno_edom(x,nan_with_flags(AMD_F_INVALID));
+#endif
+        }
+      else if (ux == 0x3ff0000000000000)
+        {
+          /* x = +1.0; return infinity with the same sign as x and set
+             the divbyzero status flag (on both platform paths) */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, PINFBITPATT_DP64, _DOMAIN,
+                              AMD_F_DIVBYZERO, EDOM, x, 0.0);
+#else
+          return retval_errno_edom(x,infinity_with_flags(AMD_F_DIVBYZERO));
+#endif
+        }
+      else
+        {
+          /* x = -1.0; as above, but the infinity is negative */
+#ifdef WINDOWS
+          return handle_error(_FUNCNAME, NINFBITPATT_DP64, _DOMAIN,
+                              AMD_F_DIVBYZERO, EDOM, x, 0.0);
+#else
+          return retval_errno_edom(x,-infinity_with_flags(AMD_F_DIVBYZERO));
+#endif
+        }
+    }
+
+  /* Every branch above returns, so here x is finite with abs(x) < 1.0 */
+  if (ax < 0x3e30000000000000)
+    {
+      if (ax == 0x0000000000000000)
+        {
+          /* x is +/-zero. Return the same zero. */
+          return x;
+        }
+      else
+        {
+          /* Arguments smaller than 2^(-28) in magnitude are
+             approximated by atanh(x) = x, raising inexact flag. */
+          return val_with_flags(x, AMD_F_INEXACT);
+        }
+    }
+  else
+    {
+      if (ax < 0x3fe0000000000000)
+        {
+          /* Arguments below 0.5 in magnitude are approximated by
+             x + x^3 * P(x^2)/Q(x^2), a [5,5] minimax rational function */
+          t = x*x;
+          poly =
+            (0.47482573589747356373e0 +
+             (-0.11028356797846341457e1 +
+              (0.88468142536501647470e0 +
+               (-0.28180210961780814148e0 +
+                (0.28728638600548514553e-1 -
+                 0.10468158892753136958e-3 * t) * t) * t) * t) * t) /
+            (0.14244772076924206909e1 +
+             (-0.41631933639693546274e1 +
+              (0.45414700626084508355e1 +
+               (-0.22608883748988489342e1 +
+                (0.49561196555503101989e0 -
+                 0.35861554370169537512e-1 * t) * t) * t) * t) * t);
+          return x + x*t*poly;
+        }
+      else
+        {
+          /* abs(x) >= 0.5 */
+          /* Note that
+               atanh(x) = 0.5 * ln((1+x)/(1-x))
+             (see Abramowitz and Stegun 4.6.22).
+             For greater accuracy we use the variant formula
+             atanh(x) = 0.5 * log(1 + 2x/(1-x)) = 0.5 * log1p(2x/(1-x)).
+          */
+          r = (2.0 * absx) / (1.0 - absx);
+          r = 0.5 * FN_PROTOTYPE(log1p)(r);
+          if (ux & SIGNBIT_DP64)
+            /* Argument x is negative */
+            return -r;
+          else
+            return r;
+        }
+    }
+}
+
+weak_alias (__atanh, atanh)
diff --git a/src/atanhf.c b/src/atanhf.c
new file mode 100644
index 0000000..38692b4
--- /dev/null
+++ b/src/atanhf.c
@@ -0,0 +1,194 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#include <stdio.h>
+
+#define USE_NANF_WITH_FLAGS
+#define USE_VALF_WITH_FLAGS
+#define USE_INFINITYF_WITH_FLAGS
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_NANF_WITH_FLAGS
+#undef USE_VALF_WITH_FLAGS
+#undef USE_INFINITYF_WITH_FLAGS
+#undef USE_HANDLE_ERRORF
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range argument */
+static inline float retval_errno_edom(float x, float retval)
+{
+  struct exception err;
+  err.name = (char *)"atanhf";
+  err.type = DOMAIN;
+  err.arg1 = (double)x;
+  err.arg2 = (double)x;
+  /* SVID mode mandates -HUGE; otherwise hand back the caller's value. */
+  err.retval = (_LIB_VERSION == _SVID_) ? -HUGE : (double)retval;
+  if (_LIB_VERSION != _POSIX_)
+    {
+      /* When matherr() returns non-zero, errno is left untouched. */
+      if (matherr(&err))
+        return err.retval;
+      if (_LIB_VERSION == _SVID_)
+        (void)fputs("atanhf: DOMAIN error\n", stderr);
+    }
+  /* POSIX mode, or matherr() returned zero: report through errno. */
+  __set_errno(EDOM);
+  return err.retval;
+}
+#endif
+
+#undef _FUNCNAME
+#define _FUNCNAME "atanhf"
+float FN_PROTOTYPE(atanhf)(float x)
+{
+  /* Single-precision atanh; the arithmetic is carried out in double. */
+  double dx;
+  unsigned int ux, ax;
+  double r, t, poly;
+
+  GET_BITS_SP32(x, ux);
+  ax = ux & ~SIGNBIT_SP32;
+
+  if ((ux & EXPBITS_SP32) == EXPBITS_SP32)
+    {
+      /* x is either NaN or infinity */
+      if (ux & MANTBITS_SP32)
+        {
+          /* x is NaN */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, ux|0x00400000, _DOMAIN,
+                              0, EDOM, x, 0.0F);
+#else
+          return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+        }
+      else
+        {
+          /* x is infinity; return a NaN */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, INDEFBITPATT_SP32, _DOMAIN,
+                               AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+          return retval_errno_edom(x,nanf_with_flags(AMD_F_INVALID));
+#endif
+        }
+    }
+  else if (ax >= 0x3f800000)
+    {
+      if (ax > 0x3f800000)
+        {
+          /* abs(x) > 1.0; return NaN */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, INDEFBITPATT_SP32, _DOMAIN,
+                               AMD_F_INVALID, EDOM, x, 0.0F);
+#else
+          return retval_errno_edom(x,nanf_with_flags(AMD_F_INVALID));
+#endif
+        }
+      else if (ux == 0x3f800000)
+        {
+          /* x = +1.0; return infinity with the same sign as x and set
+             the divbyzero status flag (on both platform paths) */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, PINFBITPATT_SP32, _DOMAIN,
+                               AMD_F_DIVBYZERO, EDOM, x, 0.0F);
+#else
+          return retval_errno_edom(x,infinityf_with_flags(AMD_F_DIVBYZERO));
+#endif
+        }
+      else
+        {
+          /* x = -1.0; as above, but the infinity is negative */
+#ifdef WINDOWS
+          return handle_errorf(_FUNCNAME, NINFBITPATT_SP32, _DOMAIN,
+                               AMD_F_DIVBYZERO, EDOM, x, 0.0F);
+#else
+          return retval_errno_edom(x,-infinityf_with_flags(AMD_F_DIVBYZERO));
+#endif
+        }
+    }
+
+  if (ax < 0x39000000)
+    {
+      if (ax == 0x00000000)
+        {
+          /* x is +/-zero. Return the same zero. */
+          return x;
+        }
+      else
+        {
+          /* Arguments smaller than 2^(-13) in magnitude are
+             approximated by atanhf(x) = x, raising inexact flag. */
+          return valf_with_flags(x, AMD_F_INEXACT);
+        }
+    }
+  else
+    {
+      dx = x;
+      if (ax < 0x3f000000)
+        {
+          /* Arguments below 0.5 in magnitude are approximated by
+             x + x^3 * P(x^2)/Q(x^2), a [2,2] minimax rational function */
+          t = dx*dx;
+          poly =
+            (0.39453629046e0 +
+           (-0.28120347286e0 +
+             0.92834212715e-2 * t) * t) /
+            (0.11836088638e1 +
+           (-0.15537744551e1 +
+             0.45281890445e0 * t) * t);
+          return (float)(dx + dx*t*poly);
+        }
+      else
+        {
+          /* abs(x) >= 0.5 */
+          /* Note that
+               atanhf(x) = 0.5 * ln((1+x)/(1-x))
+             (see Abramowitz and Stegun 4.6.22).
+             For greater accuracy we use the variant formula
+             atanhf(x) = 0.5 * log(1 + 2x/(1-x)) = 0.5 * log1p(2x/(1-x)).
+          */
+          if (ux & SIGNBIT_SP32)
+            {
+              /* Argument x is negative: evaluate at -x and negate */
+              r = (-2.0 * dx) / (1.0 + dx);
+              r = 0.5 * FN_PROTOTYPE(log1p)(r);
+              return (float)-r;
+            }
+          else
+            {
+              r = (2.0 * dx) / (1.0 - dx);
+              r = 0.5 * FN_PROTOTYPE(log1p)(r);
+              return (float)r;
+            }
+        }
+    }
+}
+
+weak_alias (__atanhf, atanhf)
diff --git a/src/ceil.c b/src/ceil.c
new file mode 100644
index 0000000..94ef21d
--- /dev/null
+++ b/src/ceil.c
@@ -0,0 +1,104 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#ifdef WINDOWS
+#include "../inc/libm_errno_amd.h"
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERROR
+#endif
+
+#ifdef WINDOWS
+#pragma function(ceil)
+#endif
+
+double FN_PROTOTYPE(ceil)(double x)
+{
+  /* Round x up to an integral value by editing its bit pattern.
+     Cases, in the order they are tested below:
+       NaN               -> x + x (or the Windows error handler)
+       |x| >= 2^53, Inf  -> x itself
+       +/-0              -> x itself (sign preserved)
+       -1 < x < 0        -> -0.0
+        0 < x < 1        -> 1.0
+       1 <= |x| < 2^53   -> clear the low mantissa bits, then add 1.0 if
+                            x was positive and any cleared bit was set */
+  double result;
+  long long unbiased_exp, negative;
+  unsigned long long bits, magnitude, truncated, frac_mask;
+
+  GET_BITS_DP64(x, bits);
+  /* magnitude holds the bits of |x| */
+  magnitude = bits & (~SIGNBIT_DP64);
+  /* negative is non-zero exactly when the sign bit of x is set */
+  negative = (bits != magnitude);
+
+  if (magnitude > 0x7ff0000000000000)
+    {
+      /* x is NaN */
+#ifdef WINDOWS
+      return handle_error("ceil", bits|0x0008000000000000, _DOMAIN, 0,
+                          EDOM, x, 0.0);
+#else
+      return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+    }
+
+  if (magnitude >= 0x4340000000000000)
+    /* abs(x) is infinity or >= 2^53; x is returned unchanged */
+    return x;
+
+  if (magnitude == 0x0000000000000000)
+    /* x is +zero or -zero; return the same zero */
+    return x;
+
+  if (magnitude < 0x3ff0000000000000)
+    {
+      /* 0.0 < abs(x) < 1.0 */
+      if (!negative)
+        return 1.0;
+      /* x < 0.0; return -0.0 */
+      PUT_BITS_DP64(0x8000000000000000, result);
+      return result;
+    }
+
+  /* 1.0 <= abs(x) < 2^53: exponent of x with the bias removed */
+  unbiased_exp = ((bits & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+  /* frac_mask covers the low mantissa bits that are to be discarded */
+  frac_mask = 1;
+  frac_mask = (frac_mask << (EXPSHIFTBITS_DP64 - unbiased_exp)) - 1;
+  /* Keep the sign, the exponent and the remaining mantissa bits */
+  truncated = (bits & ~frac_mask);
+  PUT_BITS_DP64(truncated, result);
+
+  if (!negative && (truncated != bits))
+    /* We threw some bits away and x was positive */
+    return result + 1.0;
+
+  return result;
+}
+
+weak_alias (__ceil, ceil)
diff --git a/src/ceilf.c b/src/ceilf.c
new file mode 100644
index 0000000..56d0c37
--- /dev/null
+++ b/src/ceilf.c
@@ -0,0 +1,97 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#ifdef WINDOWS
+#include "../inc/libm_errno_amd.h"
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERRORF
+#endif
+
+#ifdef WINDOWS
+#pragma function(ceilf)
+#endif
+
+float FN_PROTOTYPE(ceilf)(float x)
+{
+  float r;
+  int rexp, xneg;
+  unsigned int ux, ax, ur, mask;
+
+  GET_BITS_SP32(x, ux);
+  /* ax is the bit pattern of |x| */
+  ax = ux & (~SIGNBIT_SP32);
+  /* xneg is non-zero when the sign bit of x is set */
+  xneg = (ux != ax);
+  /* The range is divided into
+    >= 2^24 ceilf returns either the number itself or, for a NaN input,
+            always a QNaN. Raises exception if input is an SNaN
+    < 1.0   If 0.0 then return with the appropriate sign
+            If input is less than -0.0 and greater than -1.0 then return -0.0
+            If input is greater than 0.0 and less than 1.0 then return 1.0
+    1.0 <= |x| < 2^24
+            clear the low mantissa bits selected by the exponent, then add
+            1.0 if x was positive and any cleared bit was set */
+  if (ax >= 0x4b800000) /* abs(x) >= 2^24 */
+    {
+      /* abs(x) is either NaN, infinity, or >= 2^24 */
+      if (ax > 0x7f800000)
+        /* x is NaN; OR in the top mantissa bit so a QNaN is handed back */
+#ifdef WINDOWS
+        return handle_errorf("ceilf", ux|0x00400000, _DOMAIN, 0, EDOM, x, 0.0F);
+#else
+        return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+      else
+        return x;
+    }
+  else if (ax < 0x3f800000) /* abs(x) < 1.0 */
+    {
+      if (ax == 0x00000000)
+        /* x is +zero or -zero; return the same zero */
+        return x;
+      else if (xneg) /* x < 0.0 */
+        return -0.0F;
+      else
+        return 1.0F;
+    }
+  else
+    {
+      rexp = ((ux & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+      /* Mask out the bits of r that we don't want */
+      mask = (1 << (EXPSHIFTBITS_SP32 - rexp)) - 1;
+      /* Keep the sign, the exponent and the remaining mantissa bits */
+      ur = (ux & ~mask);
+      PUT_BITS_SP32(ur, r);
+
+      if (xneg || (ux == ur)) return r;
+      else
+        /* We threw some bits away and x was positive */
+        return r + 1.0F;
+    }
+}
+
+weak_alias (__ceilf, ceilf)
diff --git a/src/cosh.c b/src/cosh.c
new file mode 100644
index 0000000..6f8734b
--- /dev/null
+++ b/src/cosh.c
@@ -0,0 +1,359 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_SPLITEXP
+#define USE_SCALEDOUBLE_1
+#define USE_SCALEDOUBLE_2
+#define USE_INFINITY_WITH_FLAGS
+#define USE_VAL_WITH_FLAGS
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERROR
+#undef USE_SPLITEXP
+#undef USE_SCALEDOUBLE_1
+#undef USE_SCALEDOUBLE_2
+#undef USE_INFINITY_WITH_FLAGS
+#undef USE_VAL_WITH_FLAGS
+
+#include "../inc/libm_errno_amd.h"
+#ifndef WINDOWS
+/* Deal with errno for out-of-range result */
+static inline double retval_errno_erange(double x)
+{
+  struct exception err;
+  err.name = (char *)"cosh";
+  err.type = OVERFLOW;
+  err.arg1 = x;
+  err.arg2 = x;
+
+  /* SVID mode returns HUGE; every other mode returns the value of
+     infinity_with_flags(AMD_F_OVERFLOW), which is only called then. */
+  err.retval = (_LIB_VERSION == _SVID_)
+                 ? HUGE
+                 : infinity_with_flags(AMD_F_OVERFLOW);
+  /* errno becomes ERANGE in POSIX mode, or when matherr() returns zero.
+     The short-circuit keeps matherr() from being called in POSIX
+     mode. */
+  if (_LIB_VERSION == _POSIX_ || !matherr(&err))
+    __set_errno(ERANGE);
+
+  return err.retval;
+}
+#endif
+
+double FN_PROTOTYPE(cosh)(double x)
+{
+  /*
+    Derived from sinh subroutine
+
+    After dealing with special cases the computation is split into
+    regions as follows:
+
+    abs(x) >= max_cosh_arg:
+    cosh(x) = +Inf (overflow; the sign of x does not affect the result)
+
+    abs(x) >= small_threshold:
+    cosh(x) = exp(abs(x))/2 computed using the
+    splitexp and scaleDouble functions as for exp_amd().
+
+    abs(x) < small_threshold:
+    with y = abs(x), y0 = (int)y and dy = y - y0, evaluate
+    cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy) from the tables.  */
+
+  static const double
+    max_cosh_arg = 7.10475860073943977113e+02, /* 0x408633ce8fb9f87e */
+    thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
+    log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
+    log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
+//    small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
+    small_threshold = 20.0;
+  /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
+
+  /* Lead and tail tabulated values of sinh(i) and cosh(i) 
+     for i = 0,...,36. The lead part has 26 leading bits. */
+
+  static const double sinh_lead[   37] = {
+    0.00000000000000000000e+00,  /* 0x0000000000000000 */
+    1.17520117759704589844e+00,  /* 0x3ff2cd9fc0000000 */
+    3.62686038017272949219e+00,  /* 0x400d03cf60000000 */
+    1.00178747177124023438e+01,  /* 0x40240926e0000000 */
+    2.72899169921875000000e+01,  /* 0x403b4a3800000000 */
+    7.42032089233398437500e+01,  /* 0x40528d0160000000 */
+    2.01713153839111328125e+02,  /* 0x406936d228000000 */
+    5.48316116333007812500e+02,  /* 0x4081228768000000 */
+    1.49047882080078125000e+03,  /* 0x409749ea50000000 */
+    4.05154187011718750000e+03,  /* 0x40afa71570000000 */
+    1.10132326660156250000e+04,  /* 0x40c5829dc8000000 */
+    2.99370708007812500000e+04,  /* 0x40dd3c4488000000 */
+    8.13773945312500000000e+04,  /* 0x40f3de1650000000 */
+    2.21206695312500000000e+05,  /* 0x410b00b590000000 */
+    6.01302140625000000000e+05,  /* 0x412259ac48000000 */
+    1.63450865625000000000e+06,  /* 0x4138f0cca8000000 */
+    4.44305525000000000000e+06,  /* 0x4150f2ebd0000000 */
+    1.20774762500000000000e+07,  /* 0x4167093488000000 */
+    3.28299845000000000000e+07,  /* 0x417f4f2208000000 */
+    8.92411500000000000000e+07,  /* 0x419546d8f8000000 */
+    2.42582596000000000000e+08,  /* 0x41aceb0888000000 */
+    6.59407856000000000000e+08,  /* 0x41c3a6e1f8000000 */
+    1.79245641600000000000e+09,  /* 0x41dab5adb8000000 */
+    4.87240166400000000000e+09,  /* 0x41f226af30000000 */
+    1.32445608960000000000e+10,  /* 0x4208ab7fb0000000 */
+    3.60024494080000000000e+10,  /* 0x4220c3d390000000 */
+    9.78648043520000000000e+10,  /* 0x4236c93268000000 */
+    2.66024116224000000000e+11,  /* 0x424ef822f0000000 */
+    7.23128516608000000000e+11,  /* 0x42650bba30000000 */
+    1.96566712320000000000e+12,  /* 0x427c9aae40000000 */
+    5.34323724288000000000e+12,  /* 0x4293704708000000 */
+    1.45244246507520000000e+13,  /* 0x42aa6b7658000000 */
+    3.94814795284480000000e+13,  /* 0x42c1f43fc8000000 */
+    1.07321789251584000000e+14,  /* 0x42d866f348000000 */
+    2.91730863685632000000e+14,  /* 0x42f0953e28000000 */
+    7.93006722514944000000e+14,  /* 0x430689e220000000 */
+    2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
+
+  static const double sinh_tail[   37] = {
+    0.00000000000000000000e+00,  /* 0x0000000000000000 */
+    1.60467555584448807892e-08,  /* 0x3e513ae6096a0092 */
+    2.76742892754807136947e-08,  /* 0x3e5db70cfb79a640 */
+    2.09697499555224576530e-07,  /* 0x3e8c2526b66dc067 */
+    2.04940252448908240062e-07,  /* 0x3e8b81b18647f380 */
+    1.65444891522700935932e-06,  /* 0x3ebbc1cdd1e1eb08 */
+    3.53116789999998198721e-06,  /* 0x3ecd9f201534fb09 */
+    6.94023870987375490695e-06,  /* 0x3edd1c064a4e9954 */
+    4.98876893611587449271e-06,  /* 0x3ed4eca65d06ea74 */
+    3.19656024605152215752e-05,  /* 0x3f00c259bcc0ecc5 */
+    2.08687768377236501204e-04,  /* 0x3f2b5a6647cf9016 */
+    4.84668088325403796299e-05,  /* 0x3f09691adefb0870 */
+    1.17517985422733832468e-03,  /* 0x3f53410fc29cde38 */
+    6.90830086959560562415e-04,  /* 0x3f46a31a50b6fb3c */
+    1.45697262451506548420e-03,  /* 0x3f57defc71805c40 */
+    2.99859023684906737806e-02,  /* 0x3f9eb49fd80e0bab */
+    1.02538800507941396667e-02,  /* 0x3f84fffc7bcd5920 */
+    1.26787628407699110022e-01,  /* 0x3fc03a93b6c63435 */
+    6.86652479544033744752e-02,  /* 0x3fb1940bb255fd1c */
+    4.81593627621056619148e-01,  /* 0x3fded26e14260b50 */
+    1.70489513795397629181e+00,  /* 0x3ffb47401fc9f2a2 */
+    1.12416073482258713767e+01,  /* 0x40267bb3f55634f1 */
+    7.06579578070110514432e+00,  /* 0x401c435ff8194ddc */
+    5.91244512999659974639e+01,  /* 0x404d8fee052ba63a */
+    1.68921736147050694399e+02,  /* 0x40651d7edccde3f6 */
+    2.60692936262073658327e+02,  /* 0x40704b1644557d1a */
+    3.62419382134885609048e+02,  /* 0x4076a6b5ca0a9dc4 */
+    4.07689930834187271103e+03,  /* 0x40afd9cc72249aba */
+    1.55377375868385224749e+04,  /* 0x40ce58de693edab5 */
+    2.53720210371943067003e+04,  /* 0x40d8c70158ac6363 */
+    4.78822310734952334315e+04,  /* 0x40e7614764f43e20 */
+    1.81871712615542812273e+05,  /* 0x4106337db36fc718 */
+    5.62892347580489004031e+05,  /* 0x41212d98b1f611e2 */
+    6.41374032312148716301e+05,  /* 0x412392bc108b37cc */
+    7.57809544070145115256e+06,  /* 0x415ce87bdc3473dc */
+    3.64177136406482197344e+06,  /* 0x414bc8d5ae99ad14 */
+    7.63580561355670914054e+06}; /* 0x415d20d76744835c */
+
+  static const double cosh_lead[   37] = {
+    1.00000000000000000000e+00,  /* 0x3ff0000000000000 */
+    1.54308062791824340820e+00,  /* 0x3ff8b07550000000 */
+    3.76219564676284790039e+00,  /* 0x400e18fa08000000 */
+    1.00676617622375488281e+01,  /* 0x402422a490000000 */
+    2.73082327842712402344e+01,  /* 0x403b4ee858000000 */
+    7.42099475860595703125e+01,  /* 0x40528d6fc8000000 */
+    2.01715633392333984375e+02,  /* 0x406936e678000000 */
+    5.48317031860351562500e+02,  /* 0x4081228948000000 */
+    1.49047915649414062500e+03,  /* 0x409749eaa8000000 */
+    4.05154199218750000000e+03,  /* 0x40afa71580000000 */
+    1.10132329101562500000e+04,  /* 0x40c5829dd0000000 */
+    2.99370708007812500000e+04,  /* 0x40dd3c4488000000 */
+    8.13773945312500000000e+04,  /* 0x40f3de1650000000 */
+    2.21206695312500000000e+05,  /* 0x410b00b590000000 */
+    6.01302140625000000000e+05,  /* 0x412259ac48000000 */
+    1.63450865625000000000e+06,  /* 0x4138f0cca8000000 */
+    4.44305525000000000000e+06,  /* 0x4150f2ebd0000000 */
+    1.20774762500000000000e+07,  /* 0x4167093488000000 */
+    3.28299845000000000000e+07,  /* 0x417f4f2208000000 */
+    8.92411500000000000000e+07,  /* 0x419546d8f8000000 */
+    2.42582596000000000000e+08,  /* 0x41aceb0888000000 */
+    6.59407856000000000000e+08,  /* 0x41c3a6e1f8000000 */
+    1.79245641600000000000e+09,  /* 0x41dab5adb8000000 */
+    4.87240166400000000000e+09,  /* 0x41f226af30000000 */
+    1.32445608960000000000e+10,  /* 0x4208ab7fb0000000 */
+    3.60024494080000000000e+10,  /* 0x4220c3d390000000 */
+    9.78648043520000000000e+10,  /* 0x4236c93268000000 */
+    2.66024116224000000000e+11,  /* 0x424ef822f0000000 */
+    7.23128516608000000000e+11,  /* 0x42650bba30000000 */
+    1.96566712320000000000e+12,  /* 0x427c9aae40000000 */
+    5.34323724288000000000e+12,  /* 0x4293704708000000 */
+    1.45244246507520000000e+13,  /* 0x42aa6b7658000000 */
+    3.94814795284480000000e+13,  /* 0x42c1f43fc8000000 */
+    1.07321789251584000000e+14,  /* 0x42d866f348000000 */
+    2.91730863685632000000e+14,  /* 0x42f0953e28000000 */
+    7.93006722514944000000e+14,  /* 0x430689e220000000 */
+    2.15561576592179200000e+15}; /* 0x431ea215a0000000 */
+
+  static const double cosh_tail[   37] = {
+    0.00000000000000000000e+00,  /* 0x0000000000000000 */
+    6.89700037027478056904e-09,  /* 0x3e3d9f5504c2bd28 */
+    4.43207835591715833630e-08,  /* 0x3e67cb66f0a4c9fd */
+    2.33540217013828929694e-07,  /* 0x3e8f58617928e588 */
+    5.17452463948269748331e-08,  /* 0x3e6bc7d000c38d48 */
+    9.38728274131605919153e-07,  /* 0x3eaf7f9d4e329998 */
+    2.73012191010840495544e-06,  /* 0x3ec6e6e464885269 */
+    3.29486051438996307950e-06,  /* 0x3ecba3a8b946c154 */
+    4.75803746362771416375e-06,  /* 0x3ed3f4e76110d5a4 */
+    3.33050940471947692369e-05,  /* 0x3f017622515a3e2b */
+    9.94707313972136215365e-06,  /* 0x3ee4dc4b528af3d0 */
+    6.51685096227860253398e-05,  /* 0x3f11156278615e10 */
+    1.18132406658066663359e-03,  /* 0x3f535ad50ed821f5 */
+    6.93090416366541877541e-04,  /* 0x3f46b61055f2935c */
+    1.45780415323416845386e-03,  /* 0x3f57e2794a601240 */
+    2.99862082708111758744e-02,  /* 0x3f9eb4b45f6aadd3 */
+    1.02539925859688602072e-02,  /* 0x3f85000b967b3698 */
+    1.26787669807076286421e-01,  /* 0x3fc03a940fadc092 */
+    6.86652631843830962843e-02,  /* 0x3fb1940bf3bf874c */
+    4.81593633223853068159e-01,  /* 0x3fded26e1a2a2110 */
+    1.70489514001513020602e+00,  /* 0x3ffb4740205796d6 */
+    1.12416073489841270572e+01,  /* 0x40267bb3f55cb85d */
+    7.06579578098005001152e+00,  /* 0x401c435ff81e18ac */
+    5.91244513000686140458e+01,  /* 0x404d8fee052bdea4 */
+    1.68921736147088438429e+02,  /* 0x40651d7edccde926 */
+    2.60692936262087528121e+02,  /* 0x40704b1644557e0e */
+    3.62419382134890611269e+02,  /* 0x4076a6b5ca0a9e1c */
+    4.07689930834187453002e+03,  /* 0x40afd9cc72249abe */
+    1.55377375868385224749e+04,  /* 0x40ce58de693edab5 */
+    2.53720210371943103382e+04,  /* 0x40d8c70158ac6364 */
+    4.78822310734952334315e+04,  /* 0x40e7614764f43e20 */
+    1.81871712615542812273e+05,  /* 0x4106337db36fc718 */
+    5.62892347580489004031e+05,  /* 0x41212d98b1f611e2 */
+    6.41374032312148716301e+05,  /* 0x412392bc108b37cc */
+    7.57809544070145115256e+06,  /* 0x415ce87bdc3473dc */
+    3.64177136406482197344e+06,  /* 0x414bc8d5ae99ad14 */
+    7.63580561355670914054e+06}; /* 0x415d20d76744835c */
+
+  unsigned long long ux, aux, xneg;
+  double y, z, z1, z2;
+  int m;
+
+  /* Special cases */
+
+  GET_BITS_DP64(x, ux);
+  aux = ux & ~SIGNBIT_DP64;
+  if (aux < 0x3e30000000000000) /* |x| small enough that cosh(x) = 1 */
+  {
+      if (aux == 0)
+        /* with no inexact */
+        return 1.0;
+      else
+        return val_with_flags(1.0, AMD_F_INEXACT);
+  }
+  else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
+    {
+      if (aux > PINFBITPATT_DP64) /* |x| is a NaN? */
+         return x + x;
+      else    /* x is infinity */
+         return infinity_with_flags(0);
+    }
+
+  xneg = (aux != ux);
+
+  /* y = abs(x); the sign of x plays no further part in the result */
+  y = x;
+  if (xneg) y = -x;
+
+  if (y >= max_cosh_arg)
+      {
+      /* Return +infinity with overflow flag */
+#ifdef WINDOWS
+         /* Overflow is a range error: ERANGE, as on the non-Windows path */
+         return handle_error("cosh", PINFBITPATT_DP64, _OVERFLOW,
+                              AMD_F_OVERFLOW, ERANGE, x, 0.0);
+#else
+      return retval_errno_erange(x);
+#endif
+
+      }
+  else if (y >= small_threshold)
+    {
+      /* In this range y is large enough so that
+         the negative exponential is negligible,
+         so cosh(y) is approximated by exp(y)/2. The
+         code below is an inlined version of that from
+         exp() with two changes (it operates on
+         y instead of x, and the division by 2 is
+         done by reducing m by 1). */
+
+      splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
+               log2_by_32_tail, &m, &z1, &z2);
+      m -= 1;
+
+      if (m >= EMIN_DP64 && m <= EMAX_DP64)
+        z = scaleDouble_1((z1+z2),m);
+      else
+        z = scaleDouble_2((z1+z2),m);
+    }
+  else
+    {
+      /* In this range we find the integer part y0 of y 
+         and the increment dy = y - y0. We then compute
+ 
+         z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+         (the sinh subroutine uses sinh(y0)cosh(dy) + cosh(y0)sinh(dy))
+
+         where sinh(y0) and cosh(y0) are tabulated above. */
+
+      int ind;
+      double dy, dy2, sdy, cdy;
+
+      ind = (int)y;
+      dy = y - ind;
+
+      dy2 = dy*dy;
+      sdy = dy*dy2*(0.166666666666666667013899e0 +
+                    (0.833333333333329931873097e-2 +
+                     (0.198412698413242405162014e-3 +
+                      (0.275573191913636406057211e-5 +
+                       (0.250521176994133472333666e-7 +
+                        (0.160576793121939886190847e-9 +
+                         0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
+
+      cdy = dy2*(0.500000000000000005911074e0 +
+                 (0.416666666666660876512776e-1 +
+                  (0.138888888889814854814536e-2 +
+                   (0.248015872460622433115785e-4 +
+                    (0.275573350756016588011357e-6 +
+                     (0.208744349831471353536305e-8 +
+                      0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
+
+      /* At this point sinh(dy) is approximated by dy + sdy, and cosh(dy) is approximated by 1 + cdy.
+         The terms are summed in the order written, ending with cosh_lead[ind]. */
+      z = ((((((cosh_tail[ind]*cdy + sinh_tail[ind]*sdy)
+               + sinh_tail[ind]*dy) + cosh_tail[ind])
+             + cosh_lead[ind]*cdy) + sinh_lead[ind]*sdy)
+           + sinh_lead[ind]*dy) + cosh_lead[ind];
+    }
+
+  return z;
+}
+
+weak_alias (__cosh, cosh)
diff --git a/src/coshf.c b/src/coshf.c
new file mode 100644
index 0000000..ab2b68e
--- /dev/null
+++ b/src/coshf.c
@@ -0,0 +1,268 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#define USE_SPLITEXP
+#define USE_SCALEDOUBLE_1
+#define USE_SCALEDOUBLE_2
+#define USE_INFINITYF_WITH_FLAGS
+#define USE_VALF_WITH_FLAGS
+#include "../inc/libm_inlines_amd.h"
+#undef USE_SPLITEXP
+#undef USE_SCALEDOUBLE_1
+#undef USE_SCALEDOUBLE_2
+#undef USE_INFINITYF_WITH_FLAGS
+#undef USE_VALF_WITH_FLAGS
+
+#include "../inc/libm_errno_amd.h"
+
+#ifndef WINDOWS
+/* Deal with errno for out-of-range result */
+static inline float retval_errno_erange(float x)
+{
+  /* Overflow record handed to matherr(); both argument slots carry x. */
+  struct exception report;
+
+  report.name = (char *)"coshf";
+  report.type = OVERFLOW;
+  report.arg2 = (double)x;
+  report.arg1 = (double)x;
+  /* SVID mode reports HUGE; every other mode reports the value of
+     infinityf_with_flags(AMD_F_OVERFLOW). */
+  if (_LIB_VERSION != _SVID_)
+    report.retval = infinityf_with_flags(AMD_F_OVERFLOW);
+  else
+    report.retval = HUGE;
+
+  /* POSIX mode sets errno without consulting matherr(). */
+  if (_LIB_VERSION == _POSIX_ || !matherr(&report))
+    __set_errno(ERANGE);
+  return report.retval;
+}
+
+#endif
+float FN_PROTOTYPE(coshf)(float fx)
+{
+  /*
+    Single-precision hyperbolic cosine, evaluated in double precision.
+    After the special cases, with y = abs(x) (cosh is even):
+
+    y >= max_cosh_arg:
+    cosh(x) = +Inf, raising overflow and setting errno to ERANGE.
+
+    y >= small_threshold:
+    cosh(x) = exp(y)/2 computed using the
+    splitexp and scaleDouble functions as for exp_amd().
+
+    y < small_threshold:
+    with y0 = (int)y and dy = y - y0, use the tables below:
+    cosh(y) = cosh(y0)*cosh(dy) + sinh(y0)*sinh(dy).          */
+
+  static const double
+    /* The max argument of coshf, but stored as a double */
+    max_cosh_arg = 8.94159862922329438106e+01, /* 0x40565a9f84f82e63 */
+    thirtytwo_by_log2 = 4.61662413084468283841e+01, /* 0x40471547652b82fe */
+    log2_by_32_lead = 2.16608493356034159660e-02, /* 0x3f962e42fe000000 */
+    log2_by_32_tail = 5.68948749532545630390e-11, /* 0x3dcf473de6af278e */
+
+    small_threshold = 8*BASEDIGITS_DP64*0.30102999566398119521373889;
+//    small_threshold = 20.0;
+  /* (8*BASEDIGITS_DP64*log10of2) ' exp(-x) insignificant c.f. exp(x) */
+
+  /* Tabulated values of sinh(i) and cosh(i) for i = 0,...,36. */
+
+  static const double sinh_lead[   37] = {
+    0.00000000000000000000e+00,  /* 0x0000000000000000 */
+    1.17520119364380137839e+00,  /* 0x3ff2cd9fc44eb982 */
+    3.62686040784701857476e+00,  /* 0x400d03cf63b6e19f */
+    1.00178749274099008204e+01,  /* 0x40240926e70949ad */
+    2.72899171971277496596e+01,  /* 0x403b4a3803703630 */
+    7.42032105777887522891e+01,  /* 0x40528d0166f07374 */
+    2.01713157370279219549e+02,  /* 0x406936d22f67c805 */
+    5.48316123273246489589e+02,  /* 0x408122876ba380c9 */
+    1.49047882578955000099e+03,  /* 0x409749ea514eca65 */
+    4.05154190208278987484e+03,  /* 0x40afa7157430966f */
+    1.10132328747033916443e+04,  /* 0x40c5829dced69991 */
+    2.99370708492480553105e+04,  /* 0x40dd3c4488cb48d6 */
+    8.13773957064298447222e+04,  /* 0x40f3de1654d043f0 */
+    2.21206696003330085659e+05,  /* 0x410b00b5916a31a5 */
+    6.01302142081972560845e+05,  /* 0x412259ac48bef7e3 */
+    1.63450868623590236530e+06,  /* 0x4138f0ccafad27f6 */
+    4.44305526025387924165e+06,  /* 0x4150f2ebd0a7ffe3 */
+    1.20774763767876271158e+07,  /* 0x416709348c0ea4ed */
+    3.28299845686652474105e+07,  /* 0x417f4f22091940bb */
+    8.92411504815936237574e+07,  /* 0x419546d8f9ed26e1 */
+    2.42582597704895108938e+08,  /* 0x41aceb088b68e803 */
+    6.59407867241607308388e+08,  /* 0x41c3a6e1fd9eecfd */
+    1.79245642306579566002e+09,  /* 0x41dab5adb9c435ff */
+    4.87240172312445068359e+09,  /* 0x41f226af33b1fdc0 */
+    1.32445610649217357635e+10,  /* 0x4208ab7fb5475fb7 */
+    3.60024496686929321289e+10,  /* 0x4220c3d3920962c8 */
+    9.78648047144193725586e+10,  /* 0x4236c932696a6b5c */
+    2.66024120300899291992e+11,  /* 0x424ef822f7f6731c */
+    7.23128532145737548828e+11,  /* 0x42650bba3796379a */
+    1.96566714857202099609e+12,  /* 0x427c9aae4631c056 */
+    5.34323729076223046875e+12,  /* 0x429370470aec28ec */
+    1.45244248326237109375e+13,  /* 0x42aa6b765d8cdf6c */
+    3.94814800913403437500e+13,  /* 0x42c1f43fcc4b662c */
+    1.07321789892958031250e+14,  /* 0x42d866f34a725782 */
+    2.91730871263727437500e+14,  /* 0x42f0953e2f3a1ef7 */
+    7.93006726156715250000e+14,  /* 0x430689e221bc8d5a */
+    2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
+
+  static const double cosh_lead[   37] = {
+    1.00000000000000000000e+00,  /* 0x3ff0000000000000 */
+    1.54308063481524371241e+00,  /* 0x3ff8b07551d9f550 */
+    3.76219569108363138810e+00,  /* 0x400e18fa0df2d9bc */
+    1.00676619957777653269e+01,  /* 0x402422a497d6185e */
+    2.73082328360164865444e+01,  /* 0x403b4ee858de3e80 */
+    7.42099485247878334349e+01,  /* 0x40528d6fcbeff3a9 */
+    2.01715636122455890700e+02,  /* 0x406936e67db9b919 */
+    5.48317035155212010977e+02,  /* 0x4081228949ba3a8b */
+    1.49047916125217807348e+03,  /* 0x409749eaa93f4e76 */
+    4.05154202549259389343e+03,  /* 0x40afa715845d8894 */
+    1.10132329201033226127e+04,  /* 0x40c5829dd053712d */
+    2.99370708659497577173e+04,  /* 0x40dd3c4489115627 */
+    8.13773957125740562333e+04,  /* 0x40f3de1654d6b543 */
+    2.21206696005590405548e+05,  /* 0x410b00b5916b6105 */
+    6.01302142082804115489e+05,  /* 0x412259ac48bf13ca */
+    1.63450868623620807193e+06,  /* 0x4138f0ccafad2d17 */
+    4.44305526025399193168e+06,  /* 0x4150f2ebd0a8005c */
+    1.20774763767876680940e+07,  /* 0x416709348c0ea503 */
+    3.28299845686652623117e+07,  /* 0x417f4f22091940bf */
+    8.92411504815936237574e+07,  /* 0x419546d8f9ed26e1 */
+    2.42582597704895138741e+08,  /* 0x41aceb088b68e804 */
+    6.59407867241607308388e+08,  /* 0x41c3a6e1fd9eecfd */
+    1.79245642306579566002e+09,  /* 0x41dab5adb9c435ff */
+    4.87240172312445068359e+09,  /* 0x41f226af33b1fdc0 */
+    1.32445610649217357635e+10,  /* 0x4208ab7fb5475fb7 */
+    3.60024496686929321289e+10,  /* 0x4220c3d3920962c8 */
+    9.78648047144193725586e+10,  /* 0x4236c932696a6b5c */
+    2.66024120300899291992e+11,  /* 0x424ef822f7f6731c */
+    7.23128532145737548828e+11,  /* 0x42650bba3796379a */
+    1.96566714857202099609e+12,  /* 0x427c9aae4631c056 */
+    5.34323729076223046875e+12,  /* 0x429370470aec28ec */
+    1.45244248326237109375e+13,  /* 0x42aa6b765d8cdf6c */
+    3.94814800913403437500e+13,  /* 0x42c1f43fcc4b662c */
+    1.07321789892958031250e+14,  /* 0x42d866f34a725782 */
+    2.91730871263727437500e+14,  /* 0x42f0953e2f3a1ef7 */
+    7.93006726156715250000e+14,  /* 0x430689e221bc8d5a */
+    2.15561577355759750000e+15}; /* 0x431ea215a1d20d76 */
+
+  unsigned long long ux, aux, xneg;
+  double x = fx, y, z, z1, z2;
+  int m;
+
+  /* Special cases */
+
+  GET_BITS_DP64(x, ux);
+  aux = ux & ~SIGNBIT_DP64;
+  if (aux < 0x3f10000000000000) /* |x| small enough that cosh(x) = 1 */
+    {
+      if (aux == 0) return (float)1.0; /* with no inexact */
+      if (LAMBDA_DP64 + x  > 1.0) return valf_with_flags((float)1.0, AMD_F_INEXACT); /* with inexact */
+    }
+  else if (aux >= PINFBITPATT_DP64) /* |x| is NaN or Inf */
+    {
+      if (aux > PINFBITPATT_DP64) /* |x| is a NaN? */
+         return fx + fx;
+      else    /* x is infinity */
+         return infinityf_with_flags(0);
+    }
+
+  xneg = (aux != ux);
+
+  y = x;
+  if (xneg) y = -x;
+
+  if (y >= max_cosh_arg)
+    {
+      /* Return infinity with overflow flag. */
+      /* This handles POSIX behaviour */
+      __set_errno(ERANGE);
+        z = infinityf_with_flags(AMD_F_OVERFLOW);
+    }
+  else if (y >= small_threshold)
+    {
+      /* In this range y is large enough so that
+         the negative exponential is negligible,
+         so cosh(y) is approximated by exp(y)/2. The
+         code below is an inlined version of that from
+         exp() with two changes (it operates on
+         y instead of x, and the division by 2 is
+         done by reducing m by 1). */
+
+      splitexp(y, 1.0, thirtytwo_by_log2, log2_by_32_lead,
+               log2_by_32_tail, &m, &z1, &z2);
+      m -= 1;
+
+      /* scaleDouble_1 is always safe because the argument x was
+         float, rather than double */
+
+      z = scaleDouble_1((z1+z2),m);
+    }
+  else
+    {
+      /* In this range we find the integer part y0 of y
+         and the increment dy = y - y0. We then compute
+
+         z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+         (sdy and cdy below approximate sinh(dy) and cosh(dy))
+
+         where sinh(y0) and cosh(y0) are tabulated above. */
+
+      int ind;
+      double dy, dy2, sdy, cdy;
+
+      ind = (int)y;
+      dy = y - ind;
+
+      dy2 = dy*dy;
+
+      sdy = dy + dy*dy2*(0.166666666666666667013899e0 +
+			 (0.833333333333329931873097e-2 +
+			  (0.198412698413242405162014e-3 +
+			   (0.275573191913636406057211e-5 +
+			    (0.250521176994133472333666e-7 +
+			     (0.160576793121939886190847e-9 +
+			      0.7746188980094184251527126e-12*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
+
+      cdy = 1 + dy2*(0.500000000000000005911074e0 +
+		     (0.416666666666660876512776e-1 +
+		      (0.138888888889814854814536e-2 +
+		       (0.248015872460622433115785e-4 +
+			(0.275573350756016588011357e-6 +
+			 (0.208744349831471353536305e-8 +
+			  0.1163921388172173692062032e-10*dy2)*dy2)*dy2)*dy2)*dy2)*dy2);
+
+      z = cosh_lead[ind]*cdy + sinh_lead[ind]*sdy;
+    }
+
+  /* No sign fix-up is needed: cosh is an even function. */
+  return (float)z;
+}
+
+weak_alias (__coshf, coshf)
diff --git a/src/exp_special.c b/src/exp_special.c
new file mode 100644
index 0000000..ca32ec2
--- /dev/null
+++ b/src/exp_special.c
@@ -0,0 +1,110 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+#ifdef __x86_64__
+
+#include <emmintrin.h>
+#include <math.h>
+#include <errno.h>
+
+#include "../inc/libm_util_amd.h"
+#include "../inc/libm_special.h"
+
+// y = expf(x)
+// y = exp(x)
+
+// these codes and the ones in the related .S or .asm files have to match
+#define EXP_X_NAN       1
+#define EXP_Y_ZERO      2
+#define EXP_Y_INF       3
+
+/* Special-case handler for expf.
+   Acts on the caller-supplied code: EXP_X_NAN sets MXCSR_ES_INVALID in
+   MXCSR (or, on WIN64, calls __amd_handle_errorf with DOMAIN/EDOM);
+   EXP_Y_ZERO and EXP_Y_INF set inexact plus underflow/overflow and report
+   ERANGE.  Any other code does nothing.  y is always returned unchanged. */
+float _expf_special(float x, float y, U32 code)
+{
+    if (code == EXP_X_NAN)
+    {
+#ifdef WIN64
+        /* y is assumed to be qnan, only check x for snan */
+        UT32 xbits;
+        unsigned int x_is_snan;
+
+        xbits.f32 = x;
+        x_is_snan = ((xbits.u32 & QNAN_MASK_32) == 0) ? 1 : 0;
+        __amd_handle_errorf(DOMAIN, EDOM, "expf", x, x_is_snan, 0.0f, 0, y, 0);
+#else
+        _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
+#endif
+    }
+    else if (code == EXP_Y_ZERO)
+    {
+        /* Underflow: set the status bits, then report ERANGE. */
+        _mm_setcsr(_mm_getcsr() | (MXCSR_ES_INEXACT|MXCSR_ES_UNDERFLOW));
+        __amd_handle_errorf(UNDERFLOW, ERANGE, "expf", x, 0, 0.0f, 0, y, 0);
+    }
+    else if (code == EXP_Y_INF)
+    {
+        /* Overflow: set the status bits, then report ERANGE. */
+        _mm_setcsr(_mm_getcsr() | (MXCSR_ES_INEXACT|MXCSR_ES_OVERFLOW));
+        __amd_handle_errorf(OVERFLOW, ERANGE, "expf", x, 0, 0.0f, 0, y, 0);
+    }
+
+    return y;
+}
+
+/* Special-case handler for exp.
+   Acts on the caller-supplied code: EXP_X_NAN sets MXCSR_ES_INVALID in
+   MXCSR (or, on WIN64, calls __amd_handle_error with DOMAIN/EDOM);
+   EXP_Y_ZERO and EXP_Y_INF set inexact plus underflow/overflow and
+   report ERANGE.  Any other code does nothing.  y is always returned
+   unchanged. */
+double _exp_special(double x, double y, U32 code)
+{
+    if (code == EXP_X_NAN)
+    {
+        /* Only one of the two actions below is compiled in. */
+#ifdef WIN64
+        __amd_handle_error(DOMAIN, EDOM, "exp", x, 0.0, y);
+#else
+        _mm_setcsr(_mm_getcsr() | MXCSR_ES_INVALID);
+#endif
+    }
+    else if (code == EXP_Y_ZERO)
+    {
+        /* Underflow: set the status bits, then report ERANGE. */
+        _mm_setcsr(_mm_getcsr() | (MXCSR_ES_INEXACT|MXCSR_ES_UNDERFLOW));
+        __amd_handle_error(UNDERFLOW, ERANGE, "exp", x, 0.0, y);
+    }
+    else if (code == EXP_Y_INF)
+    {
+        /* Overflow: set the status bits, then report ERANGE. */
+        _mm_setcsr(_mm_getcsr() | (MXCSR_ES_INEXACT|MXCSR_ES_OVERFLOW));
+        __amd_handle_error(OVERFLOW, ERANGE, "exp", x, 0.0, y);
+    }
+
+    return y;
+}
+
+#endif /* __x86_64__ */
diff --git a/src/finite.c b/src/finite.c
new file mode 100644
index 0000000..7e7ca39
--- /dev/null
+++ b/src/finite.c
@@ -0,0 +1,60 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
+
+/*
+ * finite(x): report whether x is a finite number.
+ *
+ * Returns 1 when x is finite and 0 when x is an infinity or a NaN.  The
+ * test only inspects the bit pattern of x; the sign bit is ignored.
+ *
+ * The earlier non-Windows path issued andpd, comisd and setnz as three
+ * separate asm statements.  That depended on the compiler keeping the
+ * same XMM registers and the EFLAGS contents alive between statements,
+ * modified an operand declared as input-only, declared no "cc" clobber,
+ * and named an int "g" operand for the byte-sized setnz.  GCC's extended
+ * asm gives none of those guarantees, so all builds now share the
+ * integer test that the WINDOWS build already used.  This also removes
+ * the duplicated prototype, whose two preprocessor branches were
+ * identical.
+ */
+int FN_PROTOTYPE(finite)(double x)
+{
+  unsigned long long ux;
+
+  GET_BITS_DP64(x, ux);
+
+  /* Drop the sign bit and subtract the bit pattern of +infinity.  For a
+     finite x the magnitude bits are smaller than that pattern, so the
+     unsigned subtraction wraps and bit 63 of the difference is set.  For
+     an infinity or a NaN the magnitude bits are at least that pattern,
+     the difference is below 2^52, and bit 63 is clear.  Shifting bit 63
+     down therefore yields 1 for finite values and 0 otherwise. */
+  return (int)(((ux & ~SIGNBIT_DP64) - PINFBITPATT_DP64) >> 63);
+}
+
+weak_alias (__finite, finite)
diff --git a/src/finitef.c b/src/finitef.c
new file mode 100644
index 0000000..8c0613a
--- /dev/null
+++ b/src/finitef.c
@@ -0,0 +1,60 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+/* Returns 0 if x is infinite or NaN, otherwise returns 1 */
+
+/*
+ * finitef(x): report whether x is a finite number.
+ *
+ * Returns 1 when x is finite and 0 when x is an infinity or a NaN.  The
+ * test only inspects the bit pattern of x; the sign bit is ignored.
+ *
+ * The earlier non-Windows path issued andps, comiss and setnz as three
+ * separate asm statements.  That depended on the compiler keeping the
+ * same XMM registers and the EFLAGS contents alive between statements,
+ * modified an operand declared as input-only, declared no "cc" clobber,
+ * and named an int "g" operand for the byte-sized setnz.  GCC's extended
+ * asm gives none of those guarantees, so all builds now share the
+ * integer test that the WINDOWS build already used.  This also removes
+ * the duplicated prototype, whose two preprocessor branches were
+ * identical.
+ */
+int FN_PROTOTYPE(finitef)(float x)
+{
+  unsigned int ux;
+
+  GET_BITS_SP32(x, ux);
+
+  /* Drop the sign bit and subtract the bit pattern of +infinity.  For a
+     finite x the magnitude bits are smaller than that pattern, so the
+     unsigned subtraction wraps and bit 31 of the difference is set.  For
+     an infinity or a NaN the magnitude bits are at least that pattern,
+     the difference is below 2^23, and bit 31 is clear.  Shifting bit 31
+     down therefore yields 1 for finite values and 0 otherwise. */
+  return (int)(((ux & ~SIGNBIT_SP32) - PINFBITPATT_SP32) >> 31);
+}
+
+weak_alias (__finitef, finitef)
diff --git a/src/floor.c b/src/floor.c
new file mode 100644
index 0000000..a1b99c5
--- /dev/null
+++ b/src/floor.c
@@ -0,0 +1,92 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#ifdef WINDOWS
+#include "../inc/libm_errno_amd.h"
+#define USE_HANDLE_ERROR
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERROR
+#endif
+
+#ifdef WINDOWS
+#pragma function(floor)
+#endif
+
+/* floor(x): round x toward minus infinity to an integral value.
+   The result is built from x's bit pattern: the fraction bits are
+   cleared (truncation toward zero) and, when x is negative and bits
+   were actually discarded, 1.0 is subtracted.  NaN, infinity, zero and
+   values of magnitude >= 2^53 are handled by the early returns. */
+double FN_PROTOTYPE(floor)(double x)
+{
+  unsigned long long xbits, absbits, intbits, fracmask;
+  long long e;
+  int negative;
+  double whole;
+
+  GET_BITS_DP64(x, xbits);
+  absbits = xbits & (~SIGNBIT_DP64);
+  negative = (xbits != absbits);
+
+  if (absbits > 0x7ff0000000000000)
+    {
+      /* x is NaN */
+#ifdef WINDOWS
+      return handle_error("floor", xbits|0x0008000000000000, _DOMAIN,
+                          0, EDOM, x, 0.0);
+#else
+      return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+    }
+
+  if (absbits >= 0x4340000000000000)
+    return x; /* abs(x) is infinity or >= 2^53: returned unchanged */
+
+  if (absbits < 0x3ff0000000000000)
+    {
+      /* abs(x) < 1.0 */
+      if (absbits == 0x0000000000000000)
+        return x; /* x is +zero or -zero; return the same zero */
+      return negative ? -1.0 : 0.0;
+    }
+
+  /* Remaining case: 1.0 <= abs(x) < 2^53.  e is the unbiased exponent,
+     so the low (EXPSHIFTBITS_DP64 - e) bits of the pattern hold the
+     fraction; clear them to truncate toward zero. */
+  whole = x;
+  e = ((xbits & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64;
+  fracmask = 1;
+  fracmask = (fracmask << (EXPSHIFTBITS_DP64 - e)) - 1;
+  intbits = (xbits & ~fracmask);
+  PUT_BITS_DP64(intbits, whole);
+
+  /* We threw some bits away and x was negative: step down by one */
+  if (negative && (intbits != xbits))
+    return whole - 1.0;
+  return whole;
+}
+
+weak_alias (__floor, floor)
diff --git a/src/floorf.c b/src/floorf.c
new file mode 100644
index 0000000..e0f855b
--- /dev/null
+++ b/src/floorf.c
@@ -0,0 +1,87 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+#ifdef WINDOWS
+#include "../inc/libm_errno_amd.h"
+#define USE_HANDLE_ERRORF
+#include "../inc/libm_inlines_amd.h"
+#undef USE_HANDLE_ERRORF
+#endif
+
+#ifdef WINDOWS
+#pragma function(floorf)
+#endif
+
+/* floorf(x): round x toward minus infinity to an integral value by
+   clearing the fraction bits of x's bit pattern and, when x is negative
+   and bits were actually discarded, subtracting 1.0F. */
+float FN_PROTOTYPE(floorf)(float x)
+{
+  unsigned int xbits, absbits, intbits, fracmask;
+  int e, negative;
+  float whole;
+
+  GET_BITS_SP32(x, xbits);
+  absbits = xbits & (~SIGNBIT_SP32);
+  negative = (xbits != absbits);
+
+  if (absbits > 0x7f800000)
+    {
+      /* x is NaN */
+#ifdef WINDOWS
+      return handle_errorf("floorf", xbits|0x00400000, _DOMAIN,
+                           0, EDOM, x, 0.0F);
+#else
+      return x + x; /* Raise invalid if it is a signalling NaN */
+#endif
+    }
+
+  if (absbits >= 0x4b800000)
+    return x; /* abs(x) is infinity or >= 2^24: returned unchanged */
+
+  if (absbits < 0x3f800000)
+    {
+      /* abs(x) < 1.0 */
+      if (absbits == 0x00000000)
+        return x; /* x is +zero or -zero; return the same zero */
+      return negative ? -1.0F : 0.0F;
+    }
+
+  /* Remaining case: 1.0 <= abs(x) < 2^24.  e is the unbiased exponent,
+     so the low (EXPSHIFTBITS_SP32 - e) bits of the pattern hold the
+     fraction; clear them to truncate toward zero. */
+  e = ((xbits & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32;
+  fracmask = (1 << (EXPSHIFTBITS_SP32 - e)) - 1;
+  intbits = (xbits & ~fracmask);
+  PUT_BITS_SP32(intbits, whole);
+
+  /* We threw some bits away and x was negative: step down by one */
+  if (negative && (xbits != intbits))
+    return whole - 1.0F;
+  return whole;
+}
+
+weak_alias (__floorf, floorf)
diff --git a/src/frexp.c b/src/frexp.c
new file mode 100644
index 0000000..0ae109c
--- /dev/null
+++ b/src/frexp.c
@@ -0,0 +1,54 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+
+/* frexp: return value scaled to magnitude [0.5, 1) and store the power of
+   two through exp; zero, infinity and NaN come back as-is with *exp = 0. */
+double FN_PROTOTYPE(frexp)(double value, int *exp)
+{
+    UT64 bits;
+    unsigned int signbit;
+    int e;
+
+    bits.f64 = value;
+    signbit = bits.u32[1] & SIGNBIT_SP32;
+    bits.u32[1] &= ~SIGNBIT_SP32; /* remove the sign bit */
+    *exp = 0;
+    if ((bits.f64 == 0.0) || ((bits.u32[1] & 0x7ff00000) == 0x7ff00000))
+        return value; /* +-0, NaN or +-inf: hand the input straight back */
+
+    e = bits.u32[1] >> 20; /* biased exponent field */
+    if (e == 0)
+    {
+        /* x is denormal: multiply by 2^53 to bring it to the normal range */
+        bits.f64 = bits.f64 * VAL_2PMULTIPLIER_DP;
+        e = (int)(bits.u32[1] >> 20) - MULTIPLIER_DP;
+    }
+    *exp = e - 1022; /* remove bias(1023)-1 */
+    bits.u32[1] = signbit | 0x3fe00000 | (bits.u32[1] & 0x000fffff);
+    return bits.f64;
+}
+
diff --git a/src/frexpf.c b/src/frexpf.c
new file mode 100644
index 0000000..e2b4ece
--- /dev/null
+++ b/src/frexpf.c
@@ -0,0 +1,55 @@
+
+/*
+*  Copyright (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+*
+*  This file is part of libacml_mv.
+*
+*  libacml_mv is free software; you can redistribute it and/or
+*  modify it under the terms of the GNU Lesser General Public
+*  License as published by the Free Software Foundation; either
+*  version 2.1 of the License, or (at your option) any later version.
+*
+*  libacml_mv is distributed in the hope that it will be useful,
+*  but WITHOUT ANY WARRANTY; without even the implied warranty of
+*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+*  Lesser General Public License for more details.
+*
+*  You should have received a copy of the GNU Lesser General Public
+*  License along with libacml_mv.  If not, see
+*  <http://www.gnu.org/licenses/>.
+*
+*/
+
+
+#include "../inc/libm_amd.h"
+#include "../inc/libm_util_amd.h"
+
+
+
+/* frexpf: return value scaled to magnitude [0.5, 1) and store the power of
+   two through exp; zero, infinity and NaN come back as-is with *exp = 0. */
+float FN_PROTOTYPE(frexpf)(float value, int *exp)
+{
+    UT32 bits;
+    unsigned int signbit;
+    int e;
+
+    bits.f32 = value;
+    signbit = bits.u32 & SIGNBIT_SP32;
+    bits.u32 &= ~SIGNBIT_SP32; /* remove the sign bit */
+    *exp = 0;
+    if ((bits.f32 == 0.0) || ((bits.u32 & 0x7f800000) == 0x7f800000))
+        return value; /* +-0, NaN or +-inf: hand the input straight back */
+
+    e = bits.u32 >> 23; /* biased exponent field */
+    if (e == 0)
+    {
+        /* x is denormal: multiply by 2^24 to bring it to the normal range */
+        bits.f32 = bits.f32 * VAL_2PMULTIPLIER_SP;
+        e = (int)(bits.u32 >> 23) - MULTIPLIER_SP;
+    }
+    *exp = e - 126; /* remove bias(127)-1 */
+    bits.u32 = signbit | 0x3f000000 | (bits.u32 & 0x007fffff);
+    return bits.f32;
+}
+
diff --git a/src/gas/cbrt.S b/src/gas/cbrt.S
new file mode 100644
index 0000000..b733a1a
--- /dev/null
+++ b/src/gas/cbrt.S
@@ -0,0 +1,1575 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+# cbrt.S
+#
+# An implementation of the cbrt libm function.
+#
+# Prototype:
+#
+#     double cbrt(double x);
+#
+
+#
+#   Algorithm:
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(cbrt)
+#define fname_special _cbrt_special
+
+
+# local variable storage offsets
+
+.equ   store_input, -0x10 
+.equ   stack_size, 0x20 
+
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.align 32
+.p2align 4,,15
+.globl fname
+.type fname,@function
+fname:
+    xor   %rdx,%rdx # rdx = 0; dx later supplies the upper half of the idiv dividend
+    # double cbrt(double x): x is read from xmm0 and the result is left in xmm0.
+    # The stack pointer is never adjusted: the only stack slot used is
+    # store_input(%rsp), which lies below rsp, so this routine has to remain a leaf
+    # (no calls). Adding a call would require the commented-out rsp adjustment below.
+    #sub   $stack_size, %rsp
+    movd  %xmm0,%rax # rax = raw bits of x
+    movsd %xmm0,%xmm6 # xmm6 = copy of x, masked down to its sign bit later
+    mov   .L__exp_mask_64(%rip),%r10
+    mov   .L__mantissa_mask_64(%rip),%r11
+    mov   %rax,%r9
+    and   %r10,%rax # rax = exponent field, still in place
+    and   %r11,%r9 # r9 = mantissa bits
+    shr   $52,%rax # rax = biased exponent
+    cmp   $0X7FF,%rax
+    jz    .L__cbrt_is_Nan_Infinite # exponent all ones: NaN or infinity
+    cmp   $0X0,%rax
+    jz    .L__cbrt_is_denormal # exponent zero: zero or denormal
+    
+.align 32
+.L__cbrt_is_normal:   
+    mov   $3,%rcx   # cx is set to 3 to perform division and get the scale and remainder
+    pand  .L__sign_bit_64(%rip),%xmm6  # xmm6 contains the sign
+    sub   $0x3FF,%ax # ax = unbiased exponent
+    #the compare is redundant because sub already sets the flags; removing it gave no performance improvement
+    cmp   $0,%ax 
+    jge   .L__donot_change_dx
+    not   %dx # negative exponent: dx = 0xFFFF sign-extends ax into dx:ax
+.L__donot_change_dx:
+    idiv  %cx #dx:ax is divided by cx=3
+              #ax contains the quotient
+              #dx contains the remainder
+    mov   %dx,%cx # cx = remainder
+    add   $0x3FF,%ax # re-bias the quotient
+    shl   $52,%rax # move it into the exponent field
+    add   $2,%cx # cx = remainder + 2
+    shl   $1,%cx # cx = quad index into the remainder table (two quads per entry)
+                 #rax = bit pattern of the scale factor built from the quotient
+    mov   %rax,store_input(%rsp)
+    movsd store_input(%rsp),%xmm7 #xmm7 is the scaling factor = mf
+    #for denormal inputs xmm0 already holds the rescaled value
+    pand  .L__mantissa_mask_64(%rip),%xmm0
+    por   .L__zero_point_five(%rip),%xmm0 #xmm0 = Y
+    mov   %r9,%r10
+    shr   $43,%r10 
+    shr   $44,%r9 
+    and   $0x01,%r10 # r10 = mantissa bit 43, rounds the index
+    or    $0x0100,%r9 # r9 = 256 + top 8 mantissa bits
+    add   %r9,%r10 #r10 =  index_u64
+    cvtsi2sd %r10,%xmm4 #xmm4 = index_f64
+    sub $256,%r10 # r10 = index_u64 - 256, the table index
+    lea .L__INV_TAB_256(%rip),%rax
+    mulsd .L__one_by_512(%rip), %xmm4  #xmm4 = F
+    subsd %xmm4,%xmm0 # xmm0 = f
+    movsd (%rax,%r10,8),%xmm4 # xmm4 = entry of .L__INV_TAB_256 for this index
+    mulsd %xmm4,%xmm0  # xmm0 = r 
+   
+    #Now perform polynomial computation
+    
+    # movddup %xmm0,%xmm0 # xmm0 = r  ,r
+    shufpd  $0,%xmm0,%xmm0 # replacing movddup
+
+    mulsd   %xmm0,%xmm0 # xmm0 = r  ,r^2
+
+    movapd   %xmm0,%xmm4 # xmm4 = r  ,r^2
+    movapd   %xmm0,%xmm3 # xmm3 = r  ,r^2
+    mulpd   %xmm0,%xmm0 # xmm0 = r^2,r^4   #########
+    mulpd   %xmm0,%xmm3 # xmm3 = r^3,r^6   #########
+    movapd  %xmm3,%xmm2
+    mulpd   .L__coefficients_3_6(%rip),%xmm2 # xmm2 = [coeff3 * r^3, coeff6 * r^6]
+    mulpd   %xmm0,%xmm3 # xmm3 = r^5,r^10 We don't need r^10
+    unpckhpd %xmm3,%xmm4 #xmm4 = r^5,r
+    mulpd   .L__coefficients_2_4(%rip),%xmm0 # xmm0 = [coeff2 * r^2, coeff4 * r^4]
+    mulpd   .L__coefficients_5_1(%rip),%xmm4 # xmm4 = [coeff5 * r^5, coeff1 * r  ]
+    movapd %xmm4,%xmm3
+    unpckhpd %xmm3,%xmm3          #xmm3 = [~Don't Care ,coeff5 * r^5]
+    addsd %xmm3,%xmm2 # xmm2 = [coeff3 * r^3, coeff5 * r^5 + coeff6 * r^6]
+    addpd %xmm2,%xmm0 # xmm0 = [coeff2 * r^2 + coeff3 * r^3,coeff4 * r^4 + coeff5 * r^5 + coeff6 * r^6]  
+    movapd %xmm0,%xmm2 
+    unpckhpd %xmm2,%xmm2          #xmm2 = [~Don't Care ,coeff2 * r^2 + coeff3 * r^3]
+    addsd  %xmm2,%xmm0 # xmm0 = [~Don't Care, coeff2 * r^2 + coeff3 * r^3 + coeff4 * r^4 + coeff5 * r^5 + coeff6 * r^6]
+    addsd  %xmm4,%xmm0 # xmm0 = [~Don't Care, coeff1 * r   + coeff2 * r^2 + coeff3 * r^3 + coeff4 * r^4 + coeff5 * r^5 + coeff6 * r^6]
+    
+    # movddup %xmm0,%xmm0
+    shufpd  $0,%xmm0,%xmm0 # replacing movddup
+
+    
+    #Polynomial computation completes here
+    #Now compute the following
+    #switch(rem)
+    #{
+    #    case -2:    cbrtRem_h.u64 = 0x3fe428a2f0000000; cbrtRem_t.u64 = 0x3e531ae515c447bb; break;
+    #    case -1:    cbrtRem_h.u64 = 0x3fe965fea0000000; cbrtRem_t.u64 = 0x3e44f5b8f20ac166; break;
+    #    case 0:     cbrtRem_h.u64 = 0x3ff0000000000000; cbrtRem_t.u64 = 0x0000000000000000; break;
+    #    case 1:     cbrtRem_h.u64 = 0x3ff428a2f0000000; cbrtRem_t.u64 = 0x3e631ae515c447bb; break;
+    #    case 2:     cbrtRem_h.u64 = 0x3ff965fea0000000; cbrtRem_t.u64 = 0x3e54f5b8f20ac166; break;
+    #    default:    break;
+    #}
+    #cbrtF_h.u64 = CBRT_F_H[index_u64-256];
+    #cbrtF_t.u64 = CBRT_F_T[index_u64-256];
+    #
+    #bH = (cbrtF_h.f64 * cbrtRem_h.f64);
+    #bT = ((((cbrtF_t.f64 * cbrtRem_t.f64)) + (cbrtF_t.f64 * cbrtRem_h.f64)) + (cbrtRem_t.f64 * cbrtF_h.f64));
+    lea .L__cuberoot_remainder_h_l(%rip),%r8  # load both head and tail of the remainders cuberoot at once
+    movapd (%r8,%rcx,8),%xmm1 # xmm1 = [cbrtRem_h.f64,cbrtRem_t.f64]
+    shl $1,%r10 # two quads per table entry
+    lea .L__CBRT_F_H_L_256(%rip),%rax
+    movapd (%rax,%r10,8),%xmm2 # xmm2 = [cbrtF_h.f64,cbrtF_t.f64]
+    movapd %xmm2,%xmm3     
+    psrldq $8,%xmm3           # xmm3 = [~Dont Care,cbrtF_h.f64]
+    unpcklpd %xmm2,%xmm3      # xmm3 = [cbrtF_t.f64,cbrtF_h.f64]
+
+    mulpd  %xmm1,%xmm2        # xmm2 = [(cbrtF_h.f64*cbrtRem_h.f64),(cbrtRem_t.f64*cbrtF_t.f64)]
+    mulpd  %xmm1,%xmm3        # xmm3 = [(cbrtRem_h.f64*cbrtF_t.f64),(cbrtRem_t.f64*cbrtF_h.f64)]
+    movapd %xmm3,%xmm4        
+    unpckhpd %xmm4,%xmm4      # xmm4 = [(cbrtRem_h.f64*cbrtF_t.f64),(cbrtRem_h.f64*cbrtF_t.f64)]
+    addsd    %xmm4,%xmm3      # xmm3 = [~Dont Care, ((cbrtRem_h.f64*cbrtF_t.f64) + (cbrtRem_t.f64*cbrtF_h.f64))]
+    addsd    %xmm3,%xmm2      # xmm2 = [(cbrtF_h.f64*cbrtRem_h.f64),((cbrtRem_t.f64*cbrtF_t.f64)+(cbrtRem_h.f64*cbrtF_t.f64) + (cbrtRem_t.f64*cbrtF_h.f64))]
+                              # xmm2 = [bH,bT]
+    # Now calculate
+    #ans.f64 = (((((z * bT)) + (bT)) + (z * bH)) + (bH));
+    #ans.f64 = ans.f64 * mf;
+    #ans.u64 = ans.u64 | sign.u64;
+
+    movapd   %xmm2,%xmm3
+    unpckhpd %xmm3,%xmm3      # xmm3 = [bH,bH]; NOTE(review): xmm3 is not read again before ret
+                              # also xmm0 = [z,z] = the polynomial which was computed earlier
+    mulpd    %xmm2,%xmm0      # xmm0 = [(bH*z),(bT*z)]
+    movapd   %xmm0,%xmm4      
+    unpckhpd %xmm4,%xmm4      # xmm4 = [(bH*z),(bH*z)]
+    addsd    %xmm2,%xmm0      # xmm0 = [~DontCare, ((bT*z) + bT)]
+    unpckhpd %xmm2,%xmm2      # xmm2 = [(bH),(bH)]
+    addsd    %xmm4,%xmm0      # xmm0 = [~DontCare, (((bT*z) + bT) + ( z*bH))]
+    addsd    %xmm2,%xmm0      # xmm0 = [~DontCare, ((((bT*z) + bT) + (z*bH)) + bH)] = [~Dont Care,ans.f64]
+    mulsd    %xmm7,%xmm0      # xmm0 = ans.f64 * mf; mf is the scaling factor
+    por      %xmm6,%xmm0      # restore the sign
+    #add   $stack_size, %rsp
+    ret
+    
+         
+.align 32
+.L__cbrt_is_denormal:
+    movsd  .L__one_mask_64(%rip),%xmm4 # xmm4 = 1.0
+    cmp    $0,%r9
+    jz     .L__cbrt_is_zero # exponent and mantissa both zero: x is +-0
+    pand  .L__sign_mask_64(%rip),%xmm0 # xmm0 = |x|
+    por    %xmm4,%xmm0 # give the denormal mantissa the exponent of 1.0
+    subsd  %xmm4,%xmm0 # xmm0 = |x| * 2^1022, now a normal number
+    movd   %xmm0,%rax
+    mov    %rax,%r9
+    and    %r10,%rax # rax = stores the exponent
+    and    %r11,%r9 # r9 = stores the mantissa
+    shr    $52,%rax
+    sub    $1022,%rax # compensate for the 2^1022 rescaling
+    jmp    .L__cbrt_is_normal
+
+.align 32
+.L__cbrt_is_zero:
+    ret # +-0 is returned unchanged in xmm0
+.align 32
+.L__cbrt_is_Nan_Infinite:
+    cmp $0,%r9
+    jz .L__cbrt_is_Infinite # zero mantissa: infinity, returned unchanged
+    mulsd %xmm0,%xmm0 #NaN input: the multiply propagates the NaN and signals invalid for a signaling NaN
+    por  .L__qnan_mask_64(%rip),%xmm0 # force the quiet bit
+.L__cbrt_is_Infinite: 
+    #add   $stack_size, %rsp
+    ret
+
+.align 32 
+.L__mantissa_mask_64:      .quad 0x000FFFFFFFFFFFFF
+                           .quad 0          #this zero is necessary
+.L__qnan_mask_64:          .quad 0x0008000000000000
+.L__exp_mask_64:           .quad 0x7FF0000000000000
+                           .quad 0
+.L__zero:                  .quad 0x0000000000000000
+                           .quad 0
+.align 32                           
+.L__zero_point_five:       .quad 0x3FE0000000000000
+                           .quad 0
+.align 16
+.L__sign_mask_64:          .quad 0x7FFFFFFFFFFFFFFF 
+                           .quad 0
+.L__sign_bit_64:           .quad 0x8000000000000000 
+                           .quad 0
+.L__one_mask_64:           .quad 0x3FF0000000000000 
+                           .quad 0
+.L__one_by_512:            .quad 0x3f60000000000000
+                           .quad 0
+
+
+.align 16
+.L__denormal_factor:       .quad 0x3F7428A2F98D728B 
+                           .quad 0
+# The coefficients are arranged in a specific order to aid parallel multiplication.
+# The number next to each coefficient is the power of r by which it is to
+# be multiplied.
+.L__coefficients:           
+.align 32 
+.L__coefficients_5_1:       .quad 0x3fd5555555555555 # 1
+                            .quad 0x3f9ee7113506ac13 # 5
+.L__coefficients_2_4:       .quad 0xbfa511e8d2b3183b # 4
+                            .quad 0xbfbc71c71c71c71c # 2
+.L__coefficients_3_6:       .quad 0xbf98090d6221a247 # 6
+                            .quad 0x3faf9add3c0ca458 # 3
+                            .quad 0x3f93750ad588f114 # 7
+
+
+
+.align 32
+.L__cuberoot_remainder_h_l: 
+                            .quad 0x3e531ae515c447bb  # cbrt(2^-2) Low
+                            .quad 0x3FE428A2F0000000  # cbrt(2^-2) High
+                            .quad 0x3e44f5b8f20ac166  # cbrt(2^-1) Low
+                            .quad 0x3FE965FEA0000000  # cbrt(2^-1) High
+                            .quad 0x0000000000000000  # cbrt(2^0) Low
+                            .quad 0x3FF0000000000000  # cbrt(2^0) High
+                            .quad 0x3e631ae515c447bb  # cbrt(2^1) Low
+                            .quad 0x3FF428A2F0000000  # cbrt(2^1) High
+                            .quad 0x3e54f5b8f20ac166  # cbrt(2^2) Low
+                            .quad 0x3FF965FEA0000000  # cbrt(2^2) High
+
+
+
+# Interleaved pairs: the low (tail) quad comes first, then the high (head) quad of each entry
+.align 32
+.L__CBRT_F_H_L_256:
+	.quad 0x0000000000000000							
+    .quad 0x3ff0000000000000
+	.quad 0x3e6e6a24c81e4294							
+    .quad 0x3ff0055380000000
+	.quad 0x3e58548511e3a785							
+    .quad 0x3ff00aa390000000
+	.quad 0x3e64eb9336ec07f6							
+    .quad 0x3ff00ff010000000
+	.quad 0x3e40ea64b8b750e1							
+    .quad 0x3ff0153920000000
+	.quad 0x3e461637cff8a53c							
+    .quad 0x3ff01a7eb0000000
+	.quad 0x3e40733bf7bd1943
+    .quad 0x3ff01fc0d0000000
+	.quad 0x3e5666911345cced
+    .quad 0x3ff024ff80000000
+	.quad 0x3e477b7a3f592f14							
+    .quad 0x3ff02a3ad0000000
+	.quad 0x3e6f18d3dd1a5402							
+    .quad 0x3ff02f72b0000000
+	.quad 0x3e2be2f5a58ee9a4							
+    .quad 0x3ff034a750000000
+	.quad 0x3e68901f8f085fa7							
+    .quad 0x3ff039d880000000
+	.quad 0x3e5c68b8cd5b5d69							
+    .quad 0x3ff03f0670000000
+	.quad 0x3e5a6b0e8624be42							
+    .quad 0x3ff0443110000000
+	.quad 0x3dbc4b22b06f68e7							
+    .quad 0x3ff0495870000000
+	.quad 0x3e60f3f0afcabe9b							
+    .quad 0x3ff04e7c80000000
+	.quad 0x3e548495bca4e1b7							
+    .quad 0x3ff0539d60000000
+	.quad 0x3e66107f1abdfdc3							
+    .quad 0x3ff058bb00000000
+	.quad 0x3e6e67261878288a							
+    .quad 0x3ff05dd570000000
+	.quad 0x3e5a6bc155286f1e							
+    .quad 0x3ff062ecc0000000
+	.quad 0x3e58a759c64a85f2							
+    .quad 0x3ff06800e0000000
+	.quad 0x3e45fce70a4a8d09							
+    .quad 0x3ff06d11e0000000
+	.quad 0x3e32f9cbf373fe1d							
+    .quad 0x3ff0721fc0000000
+	.quad 0x3e590564ce4ac359							
+    .quad 0x3ff0772a80000000
+	.quad 0x3e5ac29ce761b02f							
+    .quad 0x3ff07c3230000000
+	.quad 0x3e5cb752f497381c							
+    .quad 0x3ff08136d0000000
+	.quad 0x3e68bb9e1cfb35e0							
+    .quad 0x3ff0863860000000
+	.quad 0x3e65b4917099de90							
+    .quad 0x3ff08b36f0000000
+	.quad 0x3e5cc77ac9c65ef2							
+    .quad 0x3ff0903280000000
+	.quad 0x3e57a0f3e7be3dba							
+    .quad 0x3ff0952b10000000
+	.quad 0x3e66ec851ee0c16f							
+    .quad 0x3ff09a20a0000000
+	.quad 0x3e689449bf2946da							
+    .quad 0x3ff09f1340000000
+	.quad 0x3e698f25301ba223							
+    .quad 0x3ff0a402f0000000
+	.quad 0x3e347d5ec651f549							
+    .quad 0x3ff0a8efc0000000
+	.quad 0x3e6c33ec9a86007a							
+    .quad 0x3ff0add990000000
+	.quad 0x3e5e0b6653e92649							
+    .quad 0x3ff0b2c090000000
+	.quad 0x3e3bd64ac09d755f							
+    .quad 0x3ff0b7a4b0000000
+	.quad 0x3e2f537506f78167							
+    .quad 0x3ff0bc85f0000000
+	.quad 0x3e62c382d1b3735e							
+    .quad 0x3ff0c16450000000
+	.quad 0x3e6e20ed659f99e1							
+    .quad 0x3ff0c63fe0000000
+	.quad 0x3e586b633a9c182a							
+    .quad 0x3ff0cb18b0000000
+	.quad 0x3e445cfd5a65e777							
+    .quad 0x3ff0cfeeb0000000
+	.quad 0x3e60c8770f58bca4							
+    .quad 0x3ff0d4c1e0000000
+	.quad 0x3e6739e44b0933c5							
+    .quad 0x3ff0d99250000000
+	.quad 0x3e027dc3d9ce7bd8							
+    .quad 0x3ff0de6010000000
+	.quad 0x3e63c53c7c5a7b64							
+    .quad 0x3ff0e32b00000000
+	.quad 0x3e69669683830cec							
+    .quad 0x3ff0e7f340000000
+	.quad 0x3e68d772c39bdcc4							
+    .quad 0x3ff0ecb8d0000000
+	.quad 0x3e69b0008bcf6d7b							
+    .quad 0x3ff0f17bb0000000
+	.quad 0x3e3bbb305825ce4f							
+    .quad 0x3ff0f63bf0000000
+	.quad 0x3e6da3f4af13a406							
+    .quad 0x3ff0faf970000000
+	.quad 0x3e5f36b96f74ce86							
+    .quad 0x3ff0ffb460000000
+	.quad 0x3e165c002303f790							
+    .quad 0x3ff1046cb0000000
+	.quad 0x3e682f84095ba7d5							
+    .quad 0x3ff1092250000000
+	.quad 0x3e6d46433541b2c6							
+    .quad 0x3ff10dd560000000
+	.quad 0x3e671c3d56e93a89							
+    .quad 0x3ff11285e0000000
+	.quad 0x3e598dcef4e40012							
+    .quad 0x3ff11733d0000000
+	.quad 0x3e4530ebef17fe03							
+    .quad 0x3ff11bdf30000000
+	.quad 0x3e4e8b8fa3715066							
+    .quad 0x3ff1208800000000
+	.quad 0x3e6ab26eb3b211dc							
+    .quad 0x3ff1252e40000000
+	.quad 0x3e454dd4dc906307							
+    .quad 0x3ff129d210000000
+	.quad 0x3e5c9f962387984e							
+    .quad 0x3ff12e7350000000
+	.quad 0x3e6c62a959afec09							
+    .quad 0x3ff1331210000000
+	.quad 0x3e6638d9ac6a866a
+    .quad 0x3ff137ae60000000
+	.quad 0x3e338704eca8a22d							
+    .quad 0x3ff13c4840000000
+	.quad 0x3e4e6c9e1db14f8f							
+    .quad 0x3ff140dfa0000000
+	.quad 0x3e58744b7f9c9eaa							
+    .quad 0x3ff1457490000000
+	.quad 0x3e66c2893486373b							
+    .quad 0x3ff14a0710000000
+	.quad 0x3e5b36bce31699b7							
+    .quad 0x3ff14e9730000000
+	.quad 0x3e671e3813d200c7							
+    .quad 0x3ff15324e0000000
+	.quad 0x3e699755ab40aa88							
+    .quad 0x3ff157b030000000
+	.quad 0x3e6b45ca0e4bcfc0							
+    .quad 0x3ff15c3920000000
+	.quad 0x3e32dd090d869c5d							
+    .quad 0x3ff160bfc0000000
+	.quad 0x3e64fe0516b917da
+    .quad 0x3ff16543f0000000
+	.quad 0x3e694563226317a2							
+    .quad 0x3ff169c5d0000000
+	.quad 0x3e653d8fafc2c851							
+    .quad 0x3ff16e4560000000
+	.quad 0x3e5dcbd41fbd41a3							
+    .quad 0x3ff172c2a0000000
+	.quad 0x3e5862ff5285f59c							
+    .quad 0x3ff1773d90000000
+	.quad 0x3e63072ea97a1e1c							
+    .quad 0x3ff17bb630000000
+	.quad 0x3e52839075184805							
+    .quad 0x3ff1802c90000000
+	.quad 0x3e64b0323e9eff42							
+    .quad 0x3ff184a0a0000000
+	.quad 0x3e6b158893c45484							
+    .quad 0x3ff1891270000000
+	.quad 0x3e3149ef0fc35826							
+    .quad 0x3ff18d8210000000
+	.quad 0x3e5f2e77ea96acaa							
+    .quad 0x3ff191ef60000000
+	.quad 0x3e5200074c471a95							
+    .quad 0x3ff1965a80000000
+	.quad 0x3e63f8cc517f6f04							
+    .quad 0x3ff19ac360000000
+	.quad 0x3e660ba2e311bb55							
+    .quad 0x3ff19f2a10000000
+	.quad 0x3e64b788730bbec3							
+    .quad 0x3ff1a38e90000000
+	.quad 0x3e657090795ee20c							
+    .quad 0x3ff1a7f0e0000000
+	.quad 0x3e6d9ffe983670b1							
+    .quad 0x3ff1ac5100000000
+	.quad 0x3e62a463ff61bfda							
+    .quad 0x3ff1b0af00000000
+	.quad 0x3e69d1bc6a5e65cf							
+    .quad 0x3ff1b50ad0000000
+	.quad 0x3e68718abaa9e922							
+    .quad 0x3ff1b96480000000
+	.quad 0x3e63c2f52ffa342e							
+    .quad 0x3ff1bdbc10000000
+	.quad 0x3e60fae13ff42c80							
+    .quad 0x3ff1c21180000000
+	.quad 0x3e65440f0ef00d57							
+    .quad 0x3ff1c664d0000000
+	.quad 0x3e46fcd22d4e3c1e							
+    .quad 0x3ff1cab610000000
+	.quad 0x3e4e0c60b409e863							
+    .quad 0x3ff1cf0530000000
+	.quad 0x3e6f9cab5a5f0333							
+    .quad 0x3ff1d35230000000
+	.quad 0x3e630f24744c333d							
+    .quad 0x3ff1d79d30000000
+	.quad 0x3e4b50622a76b2fe							
+    .quad 0x3ff1dbe620000000
+	.quad 0x3e6fdb94ba595375							
+    .quad 0x3ff1e02cf0000000
+	.quad 0x3e3861b9b945a171							
+    .quad 0x3ff1e471d0000000
+	.quad 0x3e654348015188c4							
+    .quad 0x3ff1e8b490000000
+	.quad 0x3e6b54d149865523							
+    .quad 0x3ff1ecf550000000
+	.quad 0x3e6a0bb783d9de33							
+    .quad 0x3ff1f13410000000
+	.quad 0x3e6629d12b1a2157							
+    .quad 0x3ff1f570d0000000
+	.quad 0x3e6467fe35d179df							
+    .quad 0x3ff1f9ab90000000
+	.quad 0x3e69763f3e26c8f7							
+    .quad 0x3ff1fde450000000
+	.quad 0x3e53f798bb9f7679							
+    .quad 0x3ff2021b20000000
+	.quad 0x3e552e577e855898							
+    .quad 0x3ff2064ff0000000
+	.quad 0x3e6fde47e5502c3a							
+    .quad 0x3ff20a82c0000000
+	.quad 0x3e5cbd0b548d96a0							
+    .quad 0x3ff20eb3b0000000
+	.quad 0x3e6a9cd9f7be8de8							
+    .quad 0x3ff212e2a0000000
+	.quad 0x3e522bbe704886de							
+    .quad 0x3ff2170fb0000000
+	.quad 0x3e6e3dea8317f020							
+    .quad 0x3ff21b3ac0000000
+	.quad 0x3e6e812085ac8855							
+    .quad 0x3ff21f63f0000000
+	.quad 0x3e5c87144f24cb07							
+    .quad 0x3ff2238b40000000
+	.quad 0x3e61e128ee311fa2							
+    .quad 0x3ff227b0a0000000
+	.quad 0x3e5b5c163d61a2d3							
+    .quad 0x3ff22bd420000000
+	.quad 0x3e47d97e7fb90633							
+    .quad 0x3ff22ff5c0000000
+	.quad 0x3e6efe899d50f6a7							
+    .quad 0x3ff2341570000000
+	.quad 0x3e6d0333eb75de5a							
+    .quad 0x3ff2383350000000
+	.quad 0x3e40e590be73a573							
+    .quad 0x3ff23c4f60000000
+	.quad 0x3e68ce8dcac3cdd2							
+    .quad 0x3ff2406980000000
+	.quad 0x3e6ee8a48954064b							
+    .quad 0x3ff24481d0000000
+	.quad 0x3e6aa62f18461e09							
+    .quad 0x3ff2489850000000
+	.quad 0x3e601e5940986a15							
+    .quad 0x3ff24cad00000000
+	.quad 0x3e3b082f4f9b8d4c							
+    .quad 0x3ff250bfe0000000
+	.quad 0x3e6876e0e5527f5a							
+    .quad 0x3ff254d0e0000000
+	.quad 0x3e63617080831e6b							
+    .quad 0x3ff258e020000000
+	.quad 0x3e681b26e34aa4a2							
+    .quad 0x3ff25ced90000000
+	.quad 0x3e552ee66dfab0c1							
+    .quad 0x3ff260f940000000
+	.quad 0x3e5d85a5329e8819							
+    .quad 0x3ff2650320000000
+	.quad 0x3e5105c1b646b5d1							
+    .quad 0x3ff2690b40000000
+	.quad 0x3e6bb6690c1a379c							
+    .quad 0x3ff26d1190000000
+	.quad 0x3e586aeba73ce3a9							
+    .quad 0x3ff2711630000000
+	.quad 0x3e6dd16198294dd4							
+    .quad 0x3ff2751900000000
+	.quad 0x3e6454e675775e83							
+    .quad 0x3ff2791a20000000
+	.quad 0x3e63842e026197ea							
+    .quad 0x3ff27d1980000000
+	.quad 0x3e6f1ce0e70c44d2							
+    .quad 0x3ff2811720000000
+	.quad 0x3e6ad636441a5627							
+    .quad 0x3ff2851310000000
+	.quad 0x3e54c205d7212abb							
+    .quad 0x3ff2890d50000000
+	.quad 0x3e6167c86c116419							
+    .quad 0x3ff28d05d0000000
+	.quad 0x3e638ec3ef16e294							
+    .quad 0x3ff290fca0000000
+	.quad 0x3e6473fceace9321							
+    .quad 0x3ff294f1c0000000
+	.quad 0x3e67af53a836dba7							
+    .quad 0x3ff298e530000000
+	.quad 0x3e1a51f3c383b652							
+    .quad 0x3ff29cd700000000
+	.quad 0x3e63696da190822d							
+    .quad 0x3ff2a0c710000000
+	.quad 0x3e62f9adec77074b							
+    .quad 0x3ff2a4b580000000
+	.quad 0x3e38190fd5bee55f							
+    .quad 0x3ff2a8a250000000
+	.quad 0x3e4bfee8fac68e55							
+    .quad 0x3ff2ac8d70000000
+	.quad 0x3e331c9d6bc5f68a							
+    .quad 0x3ff2b076f0000000
+	.quad 0x3e689d0523737edf							
+    .quad 0x3ff2b45ec0000000
+	.quad 0x3e5a295943bf47bb							
+    .quad 0x3ff2b84500000000
+	.quad 0x3e396be32e5b3207							
+    .quad 0x3ff2bc29a0000000
+	.quad 0x3e6e44c7d909fa0e							
+    .quad 0x3ff2c00c90000000
+	.quad 0x3e2b2505da94d9ea							
+    .quad 0x3ff2c3ee00000000
+	.quad 0x3e60c851f46c9c98							
+    .quad 0x3ff2c7cdc0000000
+	.quad 0x3e5da71f7d9aa3b7							
+    .quad 0x3ff2cbabf0000000
+	.quad 0x3e6f1b605d019ef1							
+    .quad 0x3ff2cf8880000000
+	.quad 0x3e4386e8a2189563							
+    .quad 0x3ff2d36390000000
+	.quad 0x3e3b19fa5d306ba7							
+    .quad 0x3ff2d73d00000000
+	.quad 0x3e6dd749b67aef76							
+    .quad 0x3ff2db14d0000000
+	.quad 0x3e676ff6f1dc04b0							
+    .quad 0x3ff2deeb20000000
+	.quad 0x3e635a33d0b232a6							
+    .quad 0x3ff2e2bfe0000000
+	.quad 0x3e64bdc80024a4e1							
+    .quad 0x3ff2e69310000000
+	.quad 0x3e6ebd61770fd723							
+    .quad 0x3ff2ea64b0000000
+	.quad 0x3e64769fc537264d							
+    .quad 0x3ff2ee34d0000000
+	.quad 0x3e69021f429f3b98							
+    .quad 0x3ff2f20360000000
+	.quad 0x3e5ee7083efbd606							
+    .quad 0x3ff2f5d070000000
+	.quad 0x3e6ad985552a6b1a							
+    .quad 0x3ff2f99bf0000000
+	.quad 0x3e6e3df778772160							
+    .quad 0x3ff2fd65f0000000
+	.quad 0x3e6ca5d76ddc9b34							
+    .quad 0x3ff3012e70000000
+	.quad 0x3e691154ffdbaf74							
+    .quad 0x3ff304f570000000
+	.quad 0x3e667bdd57fb306a							
+    .quad 0x3ff308baf0000000
+	.quad 0x3e67dc255ac40886							
+    .quad 0x3ff30c7ef0000000
+	.quad 0x3df219f38e8afafe							
+    .quad 0x3ff3104180000000
+	.quad 0x3e62416bf9669a04							
+    .quad 0x3ff3140280000000
+	.quad 0x3e611c96b2b3987f							
+    .quad 0x3ff317c210000000
+	.quad 0x3e6f99ed447e1177							
+    .quad 0x3ff31b8020000000
+	.quad 0x3e13245826328a11							
+    .quad 0x3ff31f3cd0000000
+	.quad 0x3e66f56dd1e645f8							
+    .quad 0x3ff322f7f0000000
+	.quad 0x3e46164946945535							
+    .quad 0x3ff326b1b0000000
+	.quad 0x3e5e37d59d190028							
+    .quad 0x3ff32a69f0000000
+	.quad 0x3e668671f12bf828							
+    .quad 0x3ff32e20c0000000
+	.quad 0x3e6e8ecbca6aabbd							
+    .quad 0x3ff331d620000000
+	.quad 0x3e53f49e109a5912							
+    .quad 0x3ff3358a20000000
+	.quad 0x3e6b8a0e11ec3043							
+    .quad 0x3ff3393ca0000000
+	.quad 0x3e65fae00aed691a							
+    .quad 0x3ff33cedc0000000
+	.quad 0x3e6c0569bece3e4a							
+    .quad 0x3ff3409d70000000
+	.quad 0x3e605e26744efbfe							
+    .quad 0x3ff3444bc0000000
+	.quad 0x3e65b570a94be5c5							
+    .quad 0x3ff347f8a0000000
+	.quad 0x3e5d6f156ea0e063							
+    .quad 0x3ff34ba420000000
+	.quad 0x3e6e0ca7612fc484							
+    .quad 0x3ff34f4e30000000
+	.quad 0x3e4963c927b25258							
+    .quad 0x3ff352f6f0000000
+	.quad 0x3e547930aa725a5c							
+    .quad 0x3ff3569e40000000
+	.quad 0x3e58a79fe3af43b3							
+    .quad 0x3ff35a4430000000
+	.quad 0x3e5e6dc29c41bdaf							
+    .quad 0x3ff35de8c0000000
+	.quad 0x3e657a2e76f863a5							
+    .quad 0x3ff3618bf0000000
+	.quad 0x3e2ae3b61716354d							
+    .quad 0x3ff3652dd0000000
+	.quad 0x3e665fb5df6906b1							
+    .quad 0x3ff368ce40000000
+	.quad 0x3e66177d7f588f7b							
+    .quad 0x3ff36c6d60000000
+	.quad 0x3e3ad55abd091b67							
+    .quad 0x3ff3700b30000000
+	.quad 0x3e155337b2422d76							
+    .quad 0x3ff373a7a0000000
+	.quad 0x3e6084ebe86972d5							
+    .quad 0x3ff37742b0000000
+	.quad 0x3e656395808e1ea3							
+    .quad 0x3ff37adc70000000
+	.quad 0x3e61bce21b40fba7							
+    .quad 0x3ff37e74e0000000
+	.quad 0x3e5006f94605b515							
+    .quad 0x3ff3820c00000000
+	.quad 0x3e6aa676aceb1f7d							
+    .quad 0x3ff385a1c0000000
+	.quad 0x3e58229f76554ce6							
+    .quad 0x3ff3893640000000
+	.quad 0x3e6eabfc6cf57330							
+    .quad 0x3ff38cc960000000
+	.quad 0x3e64daed9c0ce8bc							
+    .quad 0x3ff3905b40000000
+	.quad 0x3e60ff1768237141							
+    .quad 0x3ff393ebd0000000
+	.quad 0x3e6575f83051b085							
+    .quad 0x3ff3977b10000000
+	.quad 0x3e42667deb523e29							
+    .quad 0x3ff39b0910000000
+	.quad 0x3e1816996954f4fd							
+    .quad 0x3ff39e95c0000000
+	.quad 0x3e587cfccf4d9cd4							
+    .quad 0x3ff3a22120000000
+	.quad 0x3e52c5d018198353							
+    .quad 0x3ff3a5ab40000000
+	.quad 0x3e6a7a898dcc34aa							
+    .quad 0x3ff3a93410000000
+	.quad 0x3e2cead6dadc36d1							
+    .quad 0x3ff3acbbb0000000
+	.quad 0x3e2a55759c498bdf							
+    .quad 0x3ff3b04200000000
+	.quad 0x3e6c414a9ef6de04							
+    .quad 0x3ff3b3c700000000
+	.quad 0x3e63e2108a6e58fa							
+    .quad 0x3ff3b74ad0000000
+	.quad 0x3e5587fd7643d77c							
+    .quad 0x3ff3bacd60000000
+	.quad 0x3e3901eb1d3ff3df							
+    .quad 0x3ff3be4eb0000000
+	.quad 0x3e6f2ccd7c812fc6							
+    .quad 0x3ff3c1ceb0000000
+	.quad 0x3e21c8ee70a01049							
+    .quad 0x3ff3c54d90000000
+	.quad 0x3e563e8d02831eec							
+    .quad 0x3ff3c8cb20000000
+	.quad 0x3e6f61a42a92c7ff
+    .quad 0x3ff3cc4770000000
+	.quad 0x3dda917399c84d24
+    .quad 0x3ff3cfc2a0000000
+	.quad 0x3e5e9197c8eec2f0
+    .quad 0x3ff3d33c80000000
+	.quad 0x3e5e6f842f5a1378
+    .quad 0x3ff3d6b530000000
+	.quad 0x3e2fac242a90a0fc	
+    .quad 0x3ff3da2cb0000000
+	.quad 0x3e535ed726610227
+    .quad 0x3ff3dda2f0000000
+	.quad 0x3e50e0d64804b15b							
+    .quad 0x3ff3e11800000000
+	.quad 0x3e0560675daba814
+    .quad 0x3ff3e48be0000000
+	.quad 0x3e637388c8768032
+    .quad 0x3ff3e7fe80000000
+	.quad 0x3e3ee3c89f9e01f5
+    .quad 0x3ff3eb7000000000
+	.quad 0x3e639f6f0d09747c
+    .quad 0x3ff3eee040000000
+	.quad 0x3e4322c327abb8f0
+    .quad 0x3ff3f24f60000000
+	.quad 0x3e6961b347c8ac80
+    .quad 0x3ff3f5bd40000000
+	.quad 0x3e63711fbbd0f118
+    .quad 0x3ff3f92a00000000
+	.quad 0x3e64fad8d7718ffb
+    .quad 0x3ff3fc9590000000
+	.quad 0x3e6fffffffffffff	
+    .quad 0x3ff3fffff0000000
+	.quad 0x3e667efa79ec35b4
+    .quad 0x3ff4036930000000
+	.quad 0x3e6a737687a254a8
+    .quad 0x3ff406d140000000
+	.quad 0x3e5bace0f87d924d
+    .quad 0x3ff40a3830000000
+	.quad 0x3e629e37c237e392
+    .quad 0x3ff40d9df0000000
+	.quad 0x3e557ce7ac3f3012
+    .quad 0x3ff4110290000000
+	.quad 0x3e682829359f8fbd	
+    .quad 0x3ff4146600000000
+	.quad 0x3e6cc9be42d14676	
+    .quad 0x3ff417c850000000
+	.quad 0x3e6a8f001c137d0b	
+    .quad 0x3ff41b2980000000
+	.quad 0x3e636127687dda05	
+    .quad 0x3ff41e8990000000
+	.quad 0x3e524dba322646f0
+    .quad 0x3ff421e880000000
+	.quad 0x3e6dc43f1ed210b4	
+    .quad 0x3ff4254640000000
+	.quad 0x3e631ae515c447bb
+    .quad 0x3ff428a2f0000000
+                         
+
+.align 32
+.L__CBRT_F_H_256:   .quad 0x3ff0000000000000
+					.quad 0x3ff0055380000000
+					.quad 0x3ff00aa390000000
+					.quad 0x3ff00ff010000000
+					.quad 0x3ff0153920000000
+					.quad 0x3ff01a7eb0000000
+					.quad 0x3ff01fc0d0000000
+					.quad 0x3ff024ff80000000
+					.quad 0x3ff02a3ad0000000
+					.quad 0x3ff02f72b0000000
+					.quad 0x3ff034a750000000
+					.quad 0x3ff039d880000000
+					.quad 0x3ff03f0670000000
+					.quad 0x3ff0443110000000
+					.quad 0x3ff0495870000000
+					.quad 0x3ff04e7c80000000
+					.quad 0x3ff0539d60000000
+					.quad 0x3ff058bb00000000
+					.quad 0x3ff05dd570000000
+					.quad 0x3ff062ecc0000000
+					.quad 0x3ff06800e0000000
+					.quad 0x3ff06d11e0000000
+					.quad 0x3ff0721fc0000000
+					.quad 0x3ff0772a80000000
+					.quad 0x3ff07c3230000000
+					.quad 0x3ff08136d0000000
+					.quad 0x3ff0863860000000
+					.quad 0x3ff08b36f0000000
+					.quad 0x3ff0903280000000
+					.quad 0x3ff0952b10000000
+					.quad 0x3ff09a20a0000000
+					.quad 0x3ff09f1340000000
+					.quad 0x3ff0a402f0000000
+					.quad 0x3ff0a8efc0000000
+					.quad 0x3ff0add990000000
+					.quad 0x3ff0b2c090000000
+					.quad 0x3ff0b7a4b0000000
+					.quad 0x3ff0bc85f0000000
+					.quad 0x3ff0c16450000000
+					.quad 0x3ff0c63fe0000000
+					.quad 0x3ff0cb18b0000000
+					.quad 0x3ff0cfeeb0000000
+					.quad 0x3ff0d4c1e0000000
+					.quad 0x3ff0d99250000000
+					.quad 0x3ff0de6010000000
+					.quad 0x3ff0e32b00000000
+					.quad 0x3ff0e7f340000000
+					.quad 0x3ff0ecb8d0000000
+					.quad 0x3ff0f17bb0000000
+					.quad 0x3ff0f63bf0000000
+					.quad 0x3ff0faf970000000
+					.quad 0x3ff0ffb460000000
+					.quad 0x3ff1046cb0000000
+					.quad 0x3ff1092250000000
+					.quad 0x3ff10dd560000000
+					.quad 0x3ff11285e0000000
+					.quad 0x3ff11733d0000000
+					.quad 0x3ff11bdf30000000
+					.quad 0x3ff1208800000000
+					.quad 0x3ff1252e40000000
+					.quad 0x3ff129d210000000
+					.quad 0x3ff12e7350000000
+					.quad 0x3ff1331210000000
+					.quad 0x3ff137ae60000000
+					.quad 0x3ff13c4840000000
+					.quad 0x3ff140dfa0000000
+					.quad 0x3ff1457490000000
+					.quad 0x3ff14a0710000000
+					.quad 0x3ff14e9730000000
+					.quad 0x3ff15324e0000000
+					.quad 0x3ff157b030000000
+					.quad 0x3ff15c3920000000
+					.quad 0x3ff160bfc0000000
+					.quad 0x3ff16543f0000000
+					.quad 0x3ff169c5d0000000
+					.quad 0x3ff16e4560000000
+					.quad 0x3ff172c2a0000000
+					.quad 0x3ff1773d90000000
+					.quad 0x3ff17bb630000000
+					.quad 0x3ff1802c90000000
+					.quad 0x3ff184a0a0000000
+					.quad 0x3ff1891270000000
+					.quad 0x3ff18d8210000000
+					.quad 0x3ff191ef60000000
+					.quad 0x3ff1965a80000000
+					.quad 0x3ff19ac360000000
+					.quad 0x3ff19f2a10000000
+					.quad 0x3ff1a38e90000000
+					.quad 0x3ff1a7f0e0000000
+					.quad 0x3ff1ac5100000000
+					.quad 0x3ff1b0af00000000
+					.quad 0x3ff1b50ad0000000
+					.quad 0x3ff1b96480000000
+					.quad 0x3ff1bdbc10000000
+					.quad 0x3ff1c21180000000
+					.quad 0x3ff1c664d0000000
+					.quad 0x3ff1cab610000000
+					.quad 0x3ff1cf0530000000
+					.quad 0x3ff1d35230000000
+					.quad 0x3ff1d79d30000000
+					.quad 0x3ff1dbe620000000
+					.quad 0x3ff1e02cf0000000
+					.quad 0x3ff1e471d0000000
+					.quad 0x3ff1e8b490000000
+					.quad 0x3ff1ecf550000000
+					.quad 0x3ff1f13410000000
+					.quad 0x3ff1f570d0000000
+					.quad 0x3ff1f9ab90000000
+					.quad 0x3ff1fde450000000
+					.quad 0x3ff2021b20000000
+					.quad 0x3ff2064ff0000000
+					.quad 0x3ff20a82c0000000
+					.quad 0x3ff20eb3b0000000
+					.quad 0x3ff212e2a0000000
+					.quad 0x3ff2170fb0000000
+					.quad 0x3ff21b3ac0000000
+					.quad 0x3ff21f63f0000000
+					.quad 0x3ff2238b40000000
+					.quad 0x3ff227b0a0000000
+					.quad 0x3ff22bd420000000
+					.quad 0x3ff22ff5c0000000
+					.quad 0x3ff2341570000000
+					.quad 0x3ff2383350000000
+					.quad 0x3ff23c4f60000000
+					.quad 0x3ff2406980000000
+					.quad 0x3ff24481d0000000
+					.quad 0x3ff2489850000000
+					.quad 0x3ff24cad00000000
+					.quad 0x3ff250bfe0000000
+					.quad 0x3ff254d0e0000000
+					.quad 0x3ff258e020000000
+					.quad 0x3ff25ced90000000
+					.quad 0x3ff260f940000000
+					.quad 0x3ff2650320000000
+					.quad 0x3ff2690b40000000
+					.quad 0x3ff26d1190000000
+					.quad 0x3ff2711630000000
+					.quad 0x3ff2751900000000
+					.quad 0x3ff2791a20000000
+					.quad 0x3ff27d1980000000
+					.quad 0x3ff2811720000000
+					.quad 0x3ff2851310000000
+					.quad 0x3ff2890d50000000
+					.quad 0x3ff28d05d0000000
+					.quad 0x3ff290fca0000000
+					.quad 0x3ff294f1c0000000
+					.quad 0x3ff298e530000000
+					.quad 0x3ff29cd700000000
+					.quad 0x3ff2a0c710000000
+					.quad 0x3ff2a4b580000000
+					.quad 0x3ff2a8a250000000
+					.quad 0x3ff2ac8d70000000
+					.quad 0x3ff2b076f0000000
+					.quad 0x3ff2b45ec0000000
+					.quad 0x3ff2b84500000000
+					.quad 0x3ff2bc29a0000000
+					.quad 0x3ff2c00c90000000
+					.quad 0x3ff2c3ee00000000
+					.quad 0x3ff2c7cdc0000000
+					.quad 0x3ff2cbabf0000000
+					.quad 0x3ff2cf8880000000
+					.quad 0x3ff2d36390000000
+					.quad 0x3ff2d73d00000000
+					.quad 0x3ff2db14d0000000
+					.quad 0x3ff2deeb20000000
+					.quad 0x3ff2e2bfe0000000
+					.quad 0x3ff2e69310000000
+					.quad 0x3ff2ea64b0000000
+					.quad 0x3ff2ee34d0000000
+					.quad 0x3ff2f20360000000
+					.quad 0x3ff2f5d070000000
+					.quad 0x3ff2f99bf0000000
+					.quad 0x3ff2fd65f0000000
+					.quad 0x3ff3012e70000000
+					.quad 0x3ff304f570000000
+					.quad 0x3ff308baf0000000
+					.quad 0x3ff30c7ef0000000
+					.quad 0x3ff3104180000000
+					.quad 0x3ff3140280000000
+					.quad 0x3ff317c210000000
+					.quad 0x3ff31b8020000000
+					.quad 0x3ff31f3cd0000000
+					.quad 0x3ff322f7f0000000
+					.quad 0x3ff326b1b0000000
+					.quad 0x3ff32a69f0000000
+					.quad 0x3ff32e20c0000000
+					.quad 0x3ff331d620000000
+					.quad 0x3ff3358a20000000
+					.quad 0x3ff3393ca0000000
+					.quad 0x3ff33cedc0000000
+					.quad 0x3ff3409d70000000
+					.quad 0x3ff3444bc0000000
+					.quad 0x3ff347f8a0000000
+					.quad 0x3ff34ba420000000
+					.quad 0x3ff34f4e30000000
+					.quad 0x3ff352f6f0000000
+					.quad 0x3ff3569e40000000
+					.quad 0x3ff35a4430000000
+					.quad 0x3ff35de8c0000000
+					.quad 0x3ff3618bf0000000
+					.quad 0x3ff3652dd0000000
+					.quad 0x3ff368ce40000000
+					.quad 0x3ff36c6d60000000
+					.quad 0x3ff3700b30000000
+					.quad 0x3ff373a7a0000000
+					.quad 0x3ff37742b0000000
+					.quad 0x3ff37adc70000000
+					.quad 0x3ff37e74e0000000
+					.quad 0x3ff3820c00000000
+					.quad 0x3ff385a1c0000000
+					.quad 0x3ff3893640000000
+					.quad 0x3ff38cc960000000
+					.quad 0x3ff3905b40000000
+					.quad 0x3ff393ebd0000000
+					.quad 0x3ff3977b10000000
+					.quad 0x3ff39b0910000000
+					.quad 0x3ff39e95c0000000
+					.quad 0x3ff3a22120000000
+					.quad 0x3ff3a5ab40000000
+					.quad 0x3ff3a93410000000
+					.quad 0x3ff3acbbb0000000
+					.quad 0x3ff3b04200000000
+					.quad 0x3ff3b3c700000000
+					.quad 0x3ff3b74ad0000000
+					.quad 0x3ff3bacd60000000
+					.quad 0x3ff3be4eb0000000
+					.quad 0x3ff3c1ceb0000000
+					.quad 0x3ff3c54d90000000
+					.quad 0x3ff3c8cb20000000
+					.quad 0x3ff3cc4770000000
+					.quad 0x3ff3cfc2a0000000
+					.quad 0x3ff3d33c80000000
+					.quad 0x3ff3d6b530000000
+					.quad 0x3ff3da2cb0000000
+					.quad 0x3ff3dda2f0000000
+					.quad 0x3ff3e11800000000
+					.quad 0x3ff3e48be0000000
+					.quad 0x3ff3e7fe80000000
+					.quad 0x3ff3eb7000000000
+					.quad 0x3ff3eee040000000
+					.quad 0x3ff3f24f60000000
+					.quad 0x3ff3f5bd40000000
+					.quad 0x3ff3f92a00000000
+					.quad 0x3ff3fc9590000000
+					.quad 0x3ff3fffff0000000
+					.quad 0x3ff4036930000000
+					.quad 0x3ff406d140000000
+					.quad 0x3ff40a3830000000
+					.quad 0x3ff40d9df0000000
+					.quad 0x3ff4110290000000
+					.quad 0x3ff4146600000000
+					.quad 0x3ff417c850000000
+					.quad 0x3ff41b2980000000
+					.quad 0x3ff41e8990000000
+					.quad 0x3ff421e880000000
+					.quad 0x3ff4254640000000
+# Table .L__CBRT_F_T_256: 256 doubles (one .quad each, index 0..255), all non-negative and below 2^-24.
+.align 32	# NOTE(review): presumably the low-order tail terms that pair with a head table; confirm against the code that indexes it
+.L__CBRT_F_T_256:	.quad 0x0000000000000000
+					.quad 0x3e6e6a24c81e4294
+					.quad 0x3e58548511e3a785
+					.quad 0x3e64eb9336ec07f6
+					.quad 0x3e40ea64b8b750e1
+					.quad 0x3e461637cff8a53c
+					.quad 0x3e40733bf7bd1943
+					.quad 0x3e5666911345cced
+					.quad 0x3e477b7a3f592f14
+					.quad 0x3e6f18d3dd1a5402
+					.quad 0x3e2be2f5a58ee9a4
+					.quad 0x3e68901f8f085fa7
+					.quad 0x3e5c68b8cd5b5d69
+					.quad 0x3e5a6b0e8624be42
+					.quad 0x3dbc4b22b06f68e7
+					.quad 0x3e60f3f0afcabe9b
+					.quad 0x3e548495bca4e1b7
+					.quad 0x3e66107f1abdfdc3
+					.quad 0x3e6e67261878288a
+					.quad 0x3e5a6bc155286f1e
+					.quad 0x3e58a759c64a85f2
+					.quad 0x3e45fce70a4a8d09
+					.quad 0x3e32f9cbf373fe1d
+					.quad 0x3e590564ce4ac359
+					.quad 0x3e5ac29ce761b02f
+					.quad 0x3e5cb752f497381c
+					.quad 0x3e68bb9e1cfb35e0
+					.quad 0x3e65b4917099de90
+					.quad 0x3e5cc77ac9c65ef2
+					.quad 0x3e57a0f3e7be3dba
+					.quad 0x3e66ec851ee0c16f
+					.quad 0x3e689449bf2946da
+					.quad 0x3e698f25301ba223
+					.quad 0x3e347d5ec651f549
+					.quad 0x3e6c33ec9a86007a
+					.quad 0x3e5e0b6653e92649
+					.quad 0x3e3bd64ac09d755f
+					.quad 0x3e2f537506f78167
+					.quad 0x3e62c382d1b3735e
+					.quad 0x3e6e20ed659f99e1
+					.quad 0x3e586b633a9c182a
+					.quad 0x3e445cfd5a65e777
+					.quad 0x3e60c8770f58bca4
+					.quad 0x3e6739e44b0933c5
+					.quad 0x3e027dc3d9ce7bd8
+					.quad 0x3e63c53c7c5a7b64
+					.quad 0x3e69669683830cec
+					.quad 0x3e68d772c39bdcc4
+					.quad 0x3e69b0008bcf6d7b
+					.quad 0x3e3bbb305825ce4f
+					.quad 0x3e6da3f4af13a406
+					.quad 0x3e5f36b96f74ce86
+					.quad 0x3e165c002303f790
+					.quad 0x3e682f84095ba7d5
+					.quad 0x3e6d46433541b2c6
+					.quad 0x3e671c3d56e93a89
+					.quad 0x3e598dcef4e40012
+					.quad 0x3e4530ebef17fe03
+					.quad 0x3e4e8b8fa3715066
+					.quad 0x3e6ab26eb3b211dc
+					.quad 0x3e454dd4dc906307
+					.quad 0x3e5c9f962387984e
+					.quad 0x3e6c62a959afec09
+					.quad 0x3e6638d9ac6a866a
+					.quad 0x3e338704eca8a22d
+					.quad 0x3e4e6c9e1db14f8f
+					.quad 0x3e58744b7f9c9eaa
+					.quad 0x3e66c2893486373b
+					.quad 0x3e5b36bce31699b7
+					.quad 0x3e671e3813d200c7
+					.quad 0x3e699755ab40aa88
+					.quad 0x3e6b45ca0e4bcfc0
+					.quad 0x3e32dd090d869c5d
+					.quad 0x3e64fe0516b917da
+					.quad 0x3e694563226317a2
+					.quad 0x3e653d8fafc2c851
+					.quad 0x3e5dcbd41fbd41a3
+					.quad 0x3e5862ff5285f59c
+					.quad 0x3e63072ea97a1e1c
+					.quad 0x3e52839075184805
+					.quad 0x3e64b0323e9eff42
+					.quad 0x3e6b158893c45484
+					.quad 0x3e3149ef0fc35826
+					.quad 0x3e5f2e77ea96acaa
+					.quad 0x3e5200074c471a95
+					.quad 0x3e63f8cc517f6f04
+					.quad 0x3e660ba2e311bb55
+					.quad 0x3e64b788730bbec3
+					.quad 0x3e657090795ee20c
+					.quad 0x3e6d9ffe983670b1
+					.quad 0x3e62a463ff61bfda
+					.quad 0x3e69d1bc6a5e65cf
+					.quad 0x3e68718abaa9e922
+					.quad 0x3e63c2f52ffa342e
+					.quad 0x3e60fae13ff42c80
+					.quad 0x3e65440f0ef00d57
+					.quad 0x3e46fcd22d4e3c1e
+					.quad 0x3e4e0c60b409e863
+					.quad 0x3e6f9cab5a5f0333
+					.quad 0x3e630f24744c333d
+					.quad 0x3e4b50622a76b2fe
+					.quad 0x3e6fdb94ba595375
+					.quad 0x3e3861b9b945a171
+					.quad 0x3e654348015188c4
+					.quad 0x3e6b54d149865523
+					.quad 0x3e6a0bb783d9de33
+					.quad 0x3e6629d12b1a2157
+					.quad 0x3e6467fe35d179df
+					.quad 0x3e69763f3e26c8f7
+					.quad 0x3e53f798bb9f7679
+					.quad 0x3e552e577e855898
+					.quad 0x3e6fde47e5502c3a
+					.quad 0x3e5cbd0b548d96a0
+					.quad 0x3e6a9cd9f7be8de8
+					.quad 0x3e522bbe704886de
+					.quad 0x3e6e3dea8317f020
+					.quad 0x3e6e812085ac8855
+					.quad 0x3e5c87144f24cb07
+					.quad 0x3e61e128ee311fa2
+					.quad 0x3e5b5c163d61a2d3
+					.quad 0x3e47d97e7fb90633
+					.quad 0x3e6efe899d50f6a7
+					.quad 0x3e6d0333eb75de5a
+					.quad 0x3e40e590be73a573
+					.quad 0x3e68ce8dcac3cdd2
+					.quad 0x3e6ee8a48954064b
+					.quad 0x3e6aa62f18461e09
+					.quad 0x3e601e5940986a15
+					.quad 0x3e3b082f4f9b8d4c
+					.quad 0x3e6876e0e5527f5a
+					.quad 0x3e63617080831e6b
+					.quad 0x3e681b26e34aa4a2
+					.quad 0x3e552ee66dfab0c1
+					.quad 0x3e5d85a5329e8819
+					.quad 0x3e5105c1b646b5d1
+					.quad 0x3e6bb6690c1a379c
+					.quad 0x3e586aeba73ce3a9
+					.quad 0x3e6dd16198294dd4
+					.quad 0x3e6454e675775e83
+					.quad 0x3e63842e026197ea
+					.quad 0x3e6f1ce0e70c44d2
+					.quad 0x3e6ad636441a5627
+					.quad 0x3e54c205d7212abb
+					.quad 0x3e6167c86c116419
+					.quad 0x3e638ec3ef16e294
+					.quad 0x3e6473fceace9321
+					.quad 0x3e67af53a836dba7
+					.quad 0x3e1a51f3c383b652
+					.quad 0x3e63696da190822d
+					.quad 0x3e62f9adec77074b
+					.quad 0x3e38190fd5bee55f
+					.quad 0x3e4bfee8fac68e55
+					.quad 0x3e331c9d6bc5f68a
+					.quad 0x3e689d0523737edf
+					.quad 0x3e5a295943bf47bb
+					.quad 0x3e396be32e5b3207
+					.quad 0x3e6e44c7d909fa0e
+					.quad 0x3e2b2505da94d9ea
+					.quad 0x3e60c851f46c9c98
+					.quad 0x3e5da71f7d9aa3b7
+					.quad 0x3e6f1b605d019ef1
+					.quad 0x3e4386e8a2189563
+					.quad 0x3e3b19fa5d306ba7
+					.quad 0x3e6dd749b67aef76
+					.quad 0x3e676ff6f1dc04b0
+					.quad 0x3e635a33d0b232a6
+					.quad 0x3e64bdc80024a4e1
+					.quad 0x3e6ebd61770fd723
+					.quad 0x3e64769fc537264d
+					.quad 0x3e69021f429f3b98
+					.quad 0x3e5ee7083efbd606
+					.quad 0x3e6ad985552a6b1a
+					.quad 0x3e6e3df778772160
+					.quad 0x3e6ca5d76ddc9b34
+					.quad 0x3e691154ffdbaf74
+					.quad 0x3e667bdd57fb306a
+					.quad 0x3e67dc255ac40886
+					.quad 0x3df219f38e8afafe
+					.quad 0x3e62416bf9669a04
+					.quad 0x3e611c96b2b3987f
+					.quad 0x3e6f99ed447e1177
+					.quad 0x3e13245826328a11
+					.quad 0x3e66f56dd1e645f8
+					.quad 0x3e46164946945535
+					.quad 0x3e5e37d59d190028
+					.quad 0x3e668671f12bf828
+					.quad 0x3e6e8ecbca6aabbd
+					.quad 0x3e53f49e109a5912
+					.quad 0x3e6b8a0e11ec3043
+					.quad 0x3e65fae00aed691a
+					.quad 0x3e6c0569bece3e4a
+					.quad 0x3e605e26744efbfe
+					.quad 0x3e65b570a94be5c5
+					.quad 0x3e5d6f156ea0e063
+					.quad 0x3e6e0ca7612fc484
+					.quad 0x3e4963c927b25258
+					.quad 0x3e547930aa725a5c
+					.quad 0x3e58a79fe3af43b3
+					.quad 0x3e5e6dc29c41bdaf
+					.quad 0x3e657a2e76f863a5
+					.quad 0x3e2ae3b61716354d
+					.quad 0x3e665fb5df6906b1
+					.quad 0x3e66177d7f588f7b
+					.quad 0x3e3ad55abd091b67
+					.quad 0x3e155337b2422d76
+					.quad 0x3e6084ebe86972d5
+					.quad 0x3e656395808e1ea3
+					.quad 0x3e61bce21b40fba7
+					.quad 0x3e5006f94605b515
+					.quad 0x3e6aa676aceb1f7d
+					.quad 0x3e58229f76554ce6
+					.quad 0x3e6eabfc6cf57330
+					.quad 0x3e64daed9c0ce8bc
+					.quad 0x3e60ff1768237141
+					.quad 0x3e6575f83051b085
+					.quad 0x3e42667deb523e29
+					.quad 0x3e1816996954f4fd
+					.quad 0x3e587cfccf4d9cd4
+					.quad 0x3e52c5d018198353
+					.quad 0x3e6a7a898dcc34aa
+					.quad 0x3e2cead6dadc36d1
+					.quad 0x3e2a55759c498bdf
+					.quad 0x3e6c414a9ef6de04
+					.quad 0x3e63e2108a6e58fa
+					.quad 0x3e5587fd7643d77c
+					.quad 0x3e3901eb1d3ff3df
+					.quad 0x3e6f2ccd7c812fc6
+					.quad 0x3e21c8ee70a01049
+					.quad 0x3e563e8d02831eec
+					.quad 0x3e6f61a42a92c7ff
+					.quad 0x3dda917399c84d24
+					.quad 0x3e5e9197c8eec2f0
+					.quad 0x3e5e6f842f5a1378
+					.quad 0x3e2fac242a90a0fc
+					.quad 0x3e535ed726610227
+					.quad 0x3e50e0d64804b15b
+					.quad 0x3e0560675daba814
+					.quad 0x3e637388c8768032
+					.quad 0x3e3ee3c89f9e01f5
+					.quad 0x3e639f6f0d09747c
+					.quad 0x3e4322c327abb8f0
+					.quad 0x3e6961b347c8ac80
+					.quad 0x3e63711fbbd0f118
+					.quad 0x3e64fad8d7718ffb
+					.quad 0x3e6fffffffffffff	# index 244: largest entry, one ulp under 2^-24 (0x3e70000000000000)
+					.quad 0x3e667efa79ec35b4
+					.quad 0x3e6a737687a254a8
+					.quad 0x3e5bace0f87d924d
+					.quad 0x3e629e37c237e392
+					.quad 0x3e557ce7ac3f3012
+					.quad 0x3e682829359f8fbd
+					.quad 0x3e6cc9be42d14676
+					.quad 0x3e6a8f001c137d0b
+					.quad 0x3e636127687dda05
+					.quad 0x3e524dba322646f0
+					.quad 0x3e6dc43f1ed210b4
+# Table .L__INV_TAB_256: 257 doubles (index 0..256) falling from 2.0 to 1.0; entry i matches 512/(256+i) = 2/(1 + i/256) at i = 0, 64, 128, 194 and 256.
+.align 32	# NOTE(review): the code that indexes this table is outside this excerpt; confirm the intended formula there
+.L__INV_TAB_256:    .quad 0x4000000000000000	# i = 0: 2.0
+					.quad 0x3fffe01fe01fe020
+					.quad 0x3fffc07f01fc07f0
+					.quad 0x3fffa11caa01fa12
+					.quad 0x3fff81f81f81f820
+					.quad 0x3fff6310aca0dbb5
+					.quad 0x3fff44659e4a4271
+					.quad 0x3fff25f644230ab5
+					.quad 0x3fff07c1f07c1f08
+					.quad 0x3ffee9c7f8458e02
+					.quad 0x3ffecc07b301ecc0
+					.quad 0x3ffeae807aba01eb
+					.quad 0x3ffe9131abf0b767
+					.quad 0x3ffe741aa59750e4
+					.quad 0x3ffe573ac901e574
+					.quad 0x3ffe3a9179dc1a73
+					.quad 0x3ffe1e1e1e1e1e1e
+					.quad 0x3ffe01e01e01e01e
+					.quad 0x3ffde5d6e3f8868a
+					.quad 0x3ffdca01dca01dca
+					.quad 0x3ffdae6076b981db
+					.quad 0x3ffd92f2231e7f8a
+					.quad 0x3ffd77b654b82c34
+					.quad 0x3ffd5cac807572b2
+					.quad 0x3ffd41d41d41d41d
+					.quad 0x3ffd272ca3fc5b1a
+					.quad 0x3ffd0cb58f6ec074
+					.quad 0x3ffcf26e5c44bfc6
+					.quad 0x3ffcd85689039b0b
+					.quad 0x3ffcbe6d9601cbe7
+					.quad 0x3ffca4b3055ee191
+					.quad 0x3ffc8b265afb8a42
+					.quad 0x3ffc71c71c71c71c
+					.quad 0x3ffc5894d10d4986
+					.quad 0x3ffc3f8f01c3f8f0
+					.quad 0x3ffc26b5392ea01c
+					.quad 0x3ffc0e070381c0e0
+					.quad 0x3ffbf583ee868d8b
+					.quad 0x3ffbdd2b899406f7
+					.quad 0x3ffbc4fd65883e7b
+					.quad 0x3ffbacf914c1bad0
+					.quad 0x3ffb951e2b18ff23
+					.quad 0x3ffb7d6c3dda338b
+					.quad 0x3ffb65e2e3beee05
+					.quad 0x3ffb4e81b4e81b4f
+					.quad 0x3ffb37484ad806ce
+					.quad 0x3ffb2036406c80d9
+					.quad 0x3ffb094b31d922a4
+					.quad 0x3ffaf286bca1af28
+					.quad 0x3ffadbe87f94905e
+					.quad 0x3ffac5701ac5701b
+					.quad 0x3ffaaf1d2f87ebfd
+					.quad 0x3ffa98ef606a63be
+					.quad 0x3ffa82e65130e159
+					.quad 0x3ffa6d01a6d01a6d
+					.quad 0x3ffa574107688a4a
+					.quad 0x3ffa41a41a41a41a
+					.quad 0x3ffa2c2a87c51ca0
+					.quad 0x3ffa16d3f97a4b02
+					.quad 0x3ffa01a01a01a01a
+					.quad 0x3ff9ec8e951033d9
+					.quad 0x3ff9d79f176b682d
+					.quad 0x3ff9c2d14ee4a102
+					.quad 0x3ff9ae24ea5510da
+					.quad 0x3ff999999999999a	# i = 64: 1.6
+					.quad 0x3ff9852f0d8ec0ff
+					.quad 0x3ff970e4f80cb872
+					.quad 0x3ff95cbb0be377ae
+					.quad 0x3ff948b0fcd6e9e0
+					.quad 0x3ff934c67f9b2ce6
+					.quad 0x3ff920fb49d0e229
+					.quad 0x3ff90d4f120190d5
+					.quad 0x3ff8f9c18f9c18fa
+					.quad 0x3ff8e6527af1373f
+					.quad 0x3ff8d3018d3018d3
+					.quad 0x3ff8bfce8062ff3a
+					.quad 0x3ff8acb90f6bf3aa
+					.quad 0x3ff899c0f601899c
+					.quad 0x3ff886e5f0abb04a
+					.quad 0x3ff87427bcc092b9
+					.quad 0x3ff8618618618618
+					.quad 0x3ff84f00c2780614
+					.quad 0x3ff83c977ab2bedd
+					.quad 0x3ff82a4a0182a4a0
+					.quad 0x3ff8181818181818
+					.quad 0x3ff8060180601806
+					.quad 0x3ff7f405fd017f40
+					.quad 0x3ff7e225515a4f1d
+					.quad 0x3ff7d05f417d05f4
+					.quad 0x3ff7beb3922e017c
+					.quad 0x3ff7ad2208e0ecc3
+					.quad 0x3ff79baa6bb6398b
+					.quad 0x3ff78a4c8178a4c8
+					.quad 0x3ff77908119ac60d
+					.quad 0x3ff767dce434a9b1
+					.quad 0x3ff756cac201756d
+					.quad 0x3ff745d1745d1746
+					.quad 0x3ff734f0c541fe8d
+					.quad 0x3ff724287f46debc
+					.quad 0x3ff713786d9c7c09
+					.quad 0x3ff702e05c0b8170
+					.quad 0x3ff6f26016f26017
+					.quad 0x3ff6e1f76b4337c7
+					.quad 0x3ff6d1a62681c861
+					.quad 0x3ff6c16c16c16c17
+					.quad 0x3ff6b1490aa31a3d
+					.quad 0x3ff6a13cd1537290
+					.quad 0x3ff691473a88d0c0
+					.quad 0x3ff6816816816817
+					.quad 0x3ff6719f3601671a
+					.quad 0x3ff661ec6a5122f9
+					.quad 0x3ff6524f853b4aa3
+					.quad 0x3ff642c8590b2164
+					.quad 0x3ff63356b88ac0de
+					.quad 0x3ff623fa77016240
+					.quad 0x3ff614b36831ae94
+					.quad 0x3ff6058160581606
+					.quad 0x3ff5f66434292dfc
+					.quad 0x3ff5e75bb8d015e7
+					.quad 0x3ff5d867c3ece2a5
+					.quad 0x3ff5c9882b931057
+					.quad 0x3ff5babcc647fa91
+					.quad 0x3ff5ac056b015ac0
+					.quad 0x3ff59d61f123ccaa
+					.quad 0x3ff58ed2308158ed
+					.quad 0x3ff5805601580560
+					.quad 0x3ff571ed3c506b3a
+					.quad 0x3ff56397ba7c52e2
+					.quad 0x3ff5555555555555	# i = 128: 4/3
+					.quad 0x3ff54725e6bb82fe
+					.quad 0x3ff5390948f40feb
+					.quad 0x3ff52aff56a8054b
+					.quad 0x3ff51d07eae2f815
+					.quad 0x3ff50f22e111c4c5
+					.quad 0x3ff5015015015015
+					.quad 0x3ff4f38f62dd4c9b
+					.quad 0x3ff4e5e0a72f0539
+					.quad 0x3ff4d843bedc2c4c
+					.quad 0x3ff4cab88725af6e
+					.quad 0x3ff4bd3edda68fe1
+					.quad 0x3ff4afd6a052bf5b
+					.quad 0x3ff4a27fad76014a
+					.quad 0x3ff49539e3b2d067
+					.quad 0x3ff4880522014880
+					.quad 0x3ff47ae147ae147b
+					.quad 0x3ff46dce34596066
+					.quad 0x3ff460cbc7f5cf9a
+					.quad 0x3ff453d9e2c776ca
+					.quad 0x3ff446f86562d9fb
+					.quad 0x3ff43a2730abee4d
+					.quad 0x3ff42d6625d51f87
+					.quad 0x3ff420b5265e5951
+					.quad 0x3ff4141414141414
+					.quad 0x3ff40782d10e6566
+					.quad 0x3ff3fb013fb013fb
+					.quad 0x3ff3ee8f42a5af07
+					.quad 0x3ff3e22cbce4a902
+					.quad 0x3ff3d5d991aa75c6
+					.quad 0x3ff3c995a47babe7
+					.quad 0x3ff3bd60d9232955
+					.quad 0x3ff3b13b13b13b14
+					.quad 0x3ff3a524387ac822
+					.quad 0x3ff3991c2c187f63
+					.quad 0x3ff38d22d366088e
+					.quad 0x3ff3813813813814
+					.quad 0x3ff3755bd1c945ee
+					.quad 0x3ff3698df3de0748
+					.quad 0x3ff35dce5f9f2af8
+					.quad 0x3ff3521cfb2b78c1
+					.quad 0x3ff34679ace01346
+					.quad 0x3ff33ae45b57bcb2
+					.quad 0x3ff32f5ced6a1dfa
+					.quad 0x3ff323e34a2b10bf
+					.quad 0x3ff3187758e9ebb6
+					.quad 0x3ff30d190130d190
+					.quad 0x3ff301c82ac40260
+					.quad 0x3ff2f684bda12f68
+					.quad 0x3ff2eb4ea1fed14b
+					.quad 0x3ff2e025c04b8097
+					.quad 0x3ff2d50a012d50a0
+					.quad 0x3ff2c9fb4d812ca0
+					.quad 0x3ff2bef98e5a3711
+					.quad 0x3ff2b404ad012b40
+					.quad 0x3ff2a91c92f3c105
+					.quad 0x3ff29e4129e4129e
+					.quad 0x3ff293725bb804a5
+					.quad 0x3ff288b01288b013
+					.quad 0x3ff27dfa38a1ce4d
+					.quad 0x3ff27350b8812735
+					.quad 0x3ff268b37cd60127
+					.quad 0x3ff25e22708092f1
+					.quad 0x3ff2539d7e9177b2
+					.quad 0x3ff2492492492492
+					.quad 0x3ff23eb79717605b
+					.quad 0x3ff23456789abcdf
+					.quad 0x3ff22a0122a0122a
+					.quad 0x3ff21fb78121fb78
+					.quad 0x3ff21579804855e6
+					.quad 0x3ff20b470c67c0d9
+					.quad 0x3ff2012012012012
+					.quad 0x3ff1f7047dc11f70
+					.quad 0x3ff1ecf43c7fb84c
+					.quad 0x3ff1e2ef3b3fb874
+					.quad 0x3ff1d8f5672e4abd
+					.quad 0x3ff1cf06ada2811d
+					.quad 0x3ff1c522fc1ce059
+					.quad 0x3ff1bb4a4046ed29
+					.quad 0x3ff1b17c67f2bae3
+					.quad 0x3ff1a7b9611a7b96
+					.quad 0x3ff19e0119e0119e
+					.quad 0x3ff19453808ca29c
+					.quad 0x3ff18ab083902bdb
+					.quad 0x3ff1811811811812
+					.quad 0x3ff1778a191bd684
+					.quad 0x3ff16e0689427379
+					.quad 0x3ff1648d50fc3201
+					.quad 0x3ff15b1e5f75270d
+					.quad 0x3ff151b9a3fdd5c9
+					.quad 0x3ff1485f0e0acd3b
+					.quad 0x3ff13f0e8d344724
+					.quad 0x3ff135c81135c811
+					.quad 0x3ff12c8b89edc0ac
+					.quad 0x3ff12358e75d3033
+					.quad 0x3ff11a3019a74826
+					.quad 0x3ff1111111111111
+					.quad 0x3ff107fbbe011080
+					.quad 0x3ff0fef010fef011
+					.quad 0x3ff0f5edfab325a2
+					.quad 0x3ff0ecf56be69c90
+					.quad 0x3ff0e40655826011
+					.quad 0x3ff0db20a88f4696
+					.quad 0x3ff0d24456359e3a
+					.quad 0x3ff0c9714fbcda3b
+					.quad 0x3ff0c0a7868b4171
+					.quad 0x3ff0b7e6ec259dc8
+					.quad 0x3ff0af2f722eecb5
+					.quad 0x3ff0a6810a6810a7
+					.quad 0x3ff09ddba6af8360
+					.quad 0x3ff0953f39010954
+					.quad 0x3ff08cabb37565e2
+					.quad 0x3ff0842108421084
+					.quad 0x3ff07b9f29b8eae2
+					.quad 0x3ff073260a47f7c6
+					.quad 0x3ff06ab59c7912fb
+					.quad 0x3ff0624dd2f1a9fc
+					.quad 0x3ff059eea0727586
+					.quad 0x3ff05197f7d73404
+					.quad 0x3ff04949cc1664c5
+					.quad 0x3ff0410410410410
+					.quad 0x3ff038c6b78247fc
+					.quad 0x3ff03091b51f5e1a
+					.quad 0x3ff02864fc7729e9
+					.quad 0x3ff0204081020408
+					.quad 0x3ff0182436517a37
+					.quad 0x3ff0101010101010
+					.quad 0x3ff0080402010080
+					.quad 0x3ff0000000000000	# i = 256: 1.0
+
diff --git a/src/gas/cbrtf.S b/src/gas/cbrtf.S
new file mode 100644
index 0000000..21bdd0b
--- /dev/null
+++ b/src/gas/cbrtf.S
@@ -0,0 +1,717 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+# cbrtf.S
+#
+# An implementation of the cbrtf libm function.
+#
+# Prototype:
+#
+#     float cbrtf(float x);
+#
+
+#
+#   Algorithm:
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(cbrtf)
+#define fname_special _cbrtf_special
+
+
+# Stack frame: byte offsets from %rsp once the prologue has reserved stack_size bytes.
+.equ   store_input, 0x0         # scratch slot used to move values between XMM and integer registers
+.equ   stack_size, 0x20         # bytes reserved on entry and released before every ret
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+#######################################################################
+# float cbrtf(float x), exported under the symbol name produced by FN_PROTOTYPE(cbrtf).
+#   Input:    xmm0 = x (single precision, lane 0)
+#   Output:   xmm0 = approximate cube root of x with the sign of x; zero and infinity are returned unchanged, NaN is quieted.
+#   Scratch:  rax, rcx, rdx, r8-r11, xmm1-xmm4, xmm6, xmm7, flags, stack_size bytes of stack.
+#   Method:   write |x| = 2^e * m with m in [1,2) and e = 3*q + rem (rem in -2..2), then
+#             result = 2^q * defined_cuberoot[rem+2] * CubeRootTable[idx] * (1 + r/3 - r*r/9)
+#             where idx = top 8 mantissa bits and r = m * DoubleReciprocalTable[idx] - 1.
+#   NOTE(review): xmm6/xmm7 are caller-saved only under the SysV ABI; confirm before building for Win64.
+#######################################################################
+.text
+.align 32
+.p2align 4,,15
+.globl fname
+.type fname,@function
+fname:
+    xor   %ecx,%ecx              # rcx = 0; the denormal path sets cx to 1 as a flag
+    sub   $stack_size, %rsp
+    movss %xmm0, store_input(%rsp)
+    movss %xmm0,%xmm1
+    mov   store_input(%rsp),%r8d # 32-bit load of the 4 bytes just stored; also zeroes bits 32..63 of r8
+    mov   $0x7F800000,%r10       # r10 = float exponent mask
+    mov   $0x007FFFFF,%r11       # r11 = float mantissa mask
+    mov   %r8,%r9
+    and   %r10,%r8 # r8 = exponent field (still at bits 23..30)
+    and   %r11,%r9 # r9 = mantissa field
+    cmp   $0X7F800000,%r8
+    jz    .L__cbrtf_is_nan_infinite
+    test  %r8,%r8
+    jz    .L__cbrtf_is_denormal
+.align 32
+.L__cbrtf_is_normal:   
+    cvtss2sd %xmm1,%xmm1 # scalar widen: only lane 0 holds x, so the neighbouring float lane must not be converted
+    shr   $23,%r8  # r8 = biased exponent
+    mov   $3,%rdx  # divisor for the idiv below; also clears the upper bits of rdx
+    mov   %r8,%rax
+    movsd %xmm1,%xmm6 # copy of x, kept for its sign bit
+    shr   $15,%r9  # r9 = top 8 mantissa bits = table index 0..255
+    sub   $0x7F,%ax # ax = unbiased exponent e
+    idiv  %dl # ax / 3: al = quotient q, ah = remainder rem
+    mov   %ax,%dx
+    shr   $8,%dx # dl = rem, dh = 0
+    add   $2,%dl # rdx = rem + 2, index 0..4 into .L__defined_cuberoot
+    cbw          # sign extend q from al to ax
+    add   $0x3FF,%ax # re-bias q as a double exponent
+    shl   $52,%rax # rax = bit pattern of the double 2^q
+    pand .L__mantissa_mask_64(%rip),%xmm1    
+    mov   %rax,store_input(%rsp)
+    movsd store_input(%rsp),%xmm7 # xmm7 = 2^q
+    movsd  .L__sign_mask_64(%rip),%xmm2
+    por .L__one_mask_64(%rip),%xmm1 # xmm1 = m: mantissa of x with the exponent of 1.0
+    movapd .L__coefficients(%rip),%xmm0 # xmm0 = packed polynomial coefficients
+    pandn %xmm1,%xmm2 # xmm2 = m with the sign bit cleared
+    pand  .L__sign_mask_64(%rip),%xmm6 # xmm6 = sign bit of x only
+    lea .L__DoubleReciprocalTable_256(%rip),%r8
+    lea .L__CubeRootTable_256(%rip),%rax
+    movsd (%r8,%r9,8),%xmm3#reciprocal, Size of double is 8
+    movsd (%rax,%r9,8),%xmm4#cuberoot
+    mulsd %xmm2,%xmm3
+    subsd .L__one_mask_64(%rip),%xmm3 # xmm3 = r = m * reciprocal - 1
+
+    shufpd $0,%xmm3,%xmm3 # copy r into both lanes (SSE2 replacement for the SSE3 movddup)
+    mulsd %xmm3,%xmm3 # lanes are now (low = r*r, high = r)
+    mulpd %xmm3,%xmm0 # each coefficient times its power of r
+    # Add the two lanes.  SSE3 haddpd %xmm0,%xmm0 could replace the next three instructions where SSE3 is allowed.
+    movapd %xmm0,%xmm3
+    unpckhpd %xmm3,%xmm3
+    addsd %xmm3,%xmm0
+    addsd .L__one_mask_64(%rip),%xmm0 # xmm0 = 1 + polynomial correction
+    mulsd %xmm7,%xmm0
+    lea .L__defined_cuberoot(%rip),%rax
+    mulsd (%rax,%rdx,8),%xmm0
+    mulsd %xmm4,%xmm0
+    cmp $1,%cx
+    jnz .L__final_result
+    mulsd .L__denormal_factor(%rip),%xmm0 # denormal input: compensate for the .L__2_pow_23 pre-scaling
+
+.align 32
+.L__final_result:
+    por %xmm6, %xmm0 
+    cvtsd2ss %xmm0,%xmm0
+    add   $stack_size, %rsp
+    ret
+
+# Exponent field was zero: x is zero (returned as is) or denormal (pre-scaled, flagged in cx, then handled as normal).
+.align 32
+.L__cbrtf_is_denormal:   
+    test  %r9,%r9
+    jz .L__cbrtf_is_zero
+    mulss .L__2_pow_23(%rip),%xmm1
+    movss  %xmm1, store_input(%rsp)
+    mov   $1,%cx
+    mov   store_input(%rsp),%r8d
+    mov   %r8,%r9
+    and   %r10,%r8 # r8 = stores the exponent
+    and   %r11,%r9 # r9 = stores the mantissa
+    jmp .L__cbrtf_is_normal 
+
+# Exponent field was all ones: infinity when the mantissa is zero, NaN otherwise.
+.align 32
+.L__cbrtf_is_nan_infinite:
+    test  %r9,%r9
+    jz .L__cbrtf_is_infinite
+    mulss %xmm0,%xmm0 #raises the invalid exception when x is a signaling NaN
+    por  .L__qnan_mask_32(%rip),%xmm0 # set bit 22 of the returned NaN
+
+.L__cbrtf_is_infinite:
+.L__cbrtf_is_one:    
+.L__cbrtf_is_zero:    
+    add   $stack_size, %rsp
+    ret
+
+.align 32 
+# Constant pool.  Every value is an IEEE-754 bit pattern; each 32-bit mask is stored
+# as the mask word followed by a zero word.
+.L__mantissa_mask_32:      .long 0x007fffff, 0x00000000  # the zero word is required (original author note)
+.align 16                           
+.L__qnan_mask_32:          .long 0x00400000, 0x00000000  # bit 22, ORed into a NaN result
+.L__exp_mask_32:           .long 0x7f800000, 0x00000000  # float exponent field
+.L__zero:                  .long 0x00000000, 0x00000000
+.align 16
+# Double mantissa mask; .L__2_pow_23 shares the 16-byte slot that starts at .L__mantissa_mask_64.
+.L__mantissa_mask_64:      .quad 0x000fffffffffffff      # low 52 bits
+.L__2_pow_23:              .long 0x4b000000              # 2^23 as a float
+
+
+.align 16
+# Sign-bit mask and 1.0, each followed by a zero quad so that it fills 16 bytes.
+.L__sign_mask_64:          .quad 0x8000000000000000, 0x0
+.L__one_mask_64:           .quad 0x3ff0000000000000, 0x0
+
+.align 16
+# Double value 2^(-23/3) = 2^-8 * 2^(1/3), the cube root of the inverse of .L__2_pow_23.
+.L__denormal_factor:       .quad 0x3f7428a2f98d728b, 0x0
+
+.align 16
+# Packed pair read with movapd: low lane -1/9, high lane 1/3.
+.L__coefficients:          .quad 0xbfbc71c71c71c71c, 0x3fd5555555555555
+
+.align 16
+# Five doubles 2^(k/3) for k = -2..2, read with an index of 0..4.
+.L__defined_cuberoot:      .quad 0x3fe428a2f98d728b      # 2^(-2/3)
+                           .quad 0x3fe965fea53d6e3d      # 2^(-1/3)
+                           .quad 0x3ff0000000000000      # 1.0
+                           .quad 0x3ff428a2f98d728b      # 2^(1/3)
+                           .quad 0x3ff965fea53d6e3d      # 2^(2/3)
+                         
+.align 32 
+.L__DoubleReciprocalTable_256: .quad 0X3ff0000000000000
+            .quad 0X3fefe00000000000
+            .quad 0X3fefc00000000000
+            .quad 0X3fefa00000000000
+            .quad 0X3fef800000000000
+            .quad 0X3fef600000000000
+            .quad 0X3fef400000000000
+            .quad 0X3fef200000000000
+            .quad 0X3fef000000000000
+            .quad 0X3feee00000000000
+            .quad 0X3feec00000000000
+            .quad 0X3feea00000000000
+            .quad 0X3fee900000000000
+            .quad 0X3fee700000000000
+            .quad 0X3fee500000000000
+            .quad 0X3fee300000000000
+            .quad 0X3fee100000000000
+            .quad 0X3fee000000000000
+            .quad 0X3fede00000000000
+            .quad 0X3fedc00000000000
+            .quad 0X3feda00000000000
+            .quad 0X3fed900000000000
+            .quad 0X3fed700000000000
+            .quad 0X3fed500000000000
+            .quad 0X3fed400000000000
+            .quad 0X3fed200000000000
+            .quad 0X3fed000000000000
+            .quad 0X3fecf00000000000
+            .quad 0X3fecd00000000000
+            .quad 0X3fecb00000000000
+            .quad 0X3feca00000000000
+            .quad 0X3fec800000000000
+            .quad 0X3fec700000000000
+            .quad 0X3fec500000000000
+            .quad 0X3fec300000000000
+            .quad 0X3fec200000000000
+            .quad 0X3fec000000000000
+            .quad 0X3febf00000000000
+            .quad 0X3febd00000000000
+            .quad 0X3febc00000000000
+            .quad 0X3feba00000000000
+            .quad 0X3feb900000000000
+            .quad 0X3feb700000000000
+            .quad 0X3feb600000000000
+            .quad 0X3feb400000000000
+            .quad 0X3feb300000000000
+            .quad 0X3feb200000000000
+            .quad 0X3feb000000000000
+            .quad 0X3feaf00000000000
+            .quad 0X3fead00000000000
+            .quad 0X3feac00000000000
+            .quad 0X3feaa00000000000
+            .quad 0X3fea900000000000
+            .quad 0X3fea800000000000
+            .quad 0X3fea600000000000
+            .quad 0X3fea500000000000
+            .quad 0X3fea400000000000
+            .quad 0X3fea200000000000
+            .quad 0X3fea100000000000
+            .quad 0X3fea000000000000
+            .quad 0X3fe9e00000000000
+            .quad 0X3fe9d00000000000
+            .quad 0X3fe9c00000000000
+            .quad 0X3fe9a00000000000
+            .quad 0X3fe9900000000000
+            .quad 0X3fe9800000000000
+            .quad 0X3fe9700000000000
+            .quad 0X3fe9500000000000
+            .quad 0X3fe9400000000000
+            .quad 0X3fe9300000000000
+            .quad 0X3fe9200000000000
+            .quad 0X3fe9000000000000
+            .quad 0X3fe8f00000000000
+            .quad 0X3fe8e00000000000
+            .quad 0X3fe8d00000000000
+            .quad 0X3fe8b00000000000
+            .quad 0X3fe8a00000000000
+            .quad 0X3fe8900000000000
+            .quad 0X3fe8800000000000
+            .quad 0X3fe8700000000000
+            .quad 0X3fe8600000000000
+            .quad 0X3fe8400000000000
+            .quad 0X3fe8300000000000
+            .quad 0X3fe8200000000000
+            .quad 0X3fe8100000000000
+            .quad 0X3fe8000000000000
+            .quad 0X3fe7f00000000000
+            .quad 0X3fe7e00000000000
+            .quad 0X3fe7d00000000000
+            .quad 0X3fe7b00000000000
+            .quad 0X3fe7a00000000000
+            .quad 0X3fe7900000000000
+            .quad 0X3fe7800000000000
+            .quad 0X3fe7700000000000
+            .quad 0X3fe7600000000000
+            .quad 0X3fe7500000000000
+            .quad 0X3fe7400000000000
+            .quad 0X3fe7300000000000
+            .quad 0X3fe7200000000000
+            .quad 0X3fe7100000000000
+            .quad 0X3fe7000000000000
+            .quad 0X3fe6f00000000000
+            .quad 0X3fe6e00000000000
+            .quad 0X3fe6d00000000000
+            .quad 0X3fe6c00000000000
+            .quad 0X3fe6b00000000000
+            .quad 0X3fe6a00000000000
+            .quad 0X3fe6900000000000
+            .quad 0X3fe6800000000000
+            .quad 0X3fe6700000000000
+            .quad 0X3fe6600000000000
+            .quad 0X3fe6500000000000
+            .quad 0X3fe6400000000000
+            .quad 0X3fe6300000000000
+            .quad 0X3fe6200000000000
+            .quad 0X3fe6100000000000
+            .quad 0X3fe6000000000000
+            .quad 0X3fe5f00000000000
+            .quad 0X3fe5e00000000000
+            .quad 0X3fe5d00000000000
+            .quad 0X3fe5c00000000000
+            .quad 0X3fe5b00000000000
+            .quad 0X3fe5a00000000000
+            .quad 0X3fe5900000000000
+            .quad 0X3fe5800000000000
+            .quad 0X3fe5800000000000
+            .quad 0X3fe5700000000000
+            .quad 0X3fe5600000000000
+            .quad 0X3fe5500000000000
+            .quad 0X3fe5400000000000
+            .quad 0X3fe5300000000000
+            .quad 0X3fe5200000000000
+            .quad 0X3fe5100000000000
+            .quad 0X3fe5000000000000
+            .quad 0X3fe5000000000000
+            .quad 0X3fe4f00000000000
+            .quad 0X3fe4e00000000000
+            .quad 0X3fe4d00000000000
+            .quad 0X3fe4c00000000000
+            .quad 0X3fe4b00000000000
+            .quad 0X3fe4a00000000000
+            .quad 0X3fe4a00000000000
+            .quad 0X3fe4900000000000
+            .quad 0X3fe4800000000000
+            .quad 0X3fe4700000000000
+            .quad 0X3fe4600000000000
+            .quad 0X3fe4600000000000
+            .quad 0X3fe4500000000000
+            .quad 0X3fe4400000000000
+            .quad 0X3fe4300000000000
+            .quad 0X3fe4200000000000
+            .quad 0X3fe4200000000000
+            .quad 0X3fe4100000000000
+            .quad 0X3fe4000000000000
+            .quad 0X3fe3f00000000000
+            .quad 0X3fe3e00000000000
+            .quad 0X3fe3e00000000000
+            .quad 0X3fe3d00000000000
+            .quad 0X3fe3c00000000000
+            .quad 0X3fe3b00000000000
+            .quad 0X3fe3b00000000000
+            .quad 0X3fe3a00000000000
+            .quad 0X3fe3900000000000
+            .quad 0X3fe3800000000000
+            .quad 0X3fe3800000000000
+            .quad 0X3fe3700000000000
+            .quad 0X3fe3600000000000
+            .quad 0X3fe3500000000000
+            .quad 0X3fe3500000000000
+            .quad 0X3fe3400000000000
+            .quad 0X3fe3300000000000
+            .quad 0X3fe3200000000000
+            .quad 0X3fe3200000000000
+            .quad 0X3fe3100000000000
+            .quad 0X3fe3000000000000
+            .quad 0X3fe3000000000000
+            .quad 0X3fe2f00000000000
+            .quad 0X3fe2e00000000000
+            .quad 0X3fe2e00000000000
+            .quad 0X3fe2d00000000000
+            .quad 0X3fe2c00000000000
+            .quad 0X3fe2b00000000000
+            .quad 0X3fe2b00000000000
+            .quad 0X3fe2a00000000000
+            .quad 0X3fe2900000000000
+            .quad 0X3fe2900000000000
+            .quad 0X3fe2800000000000
+            .quad 0X3fe2700000000000
+            .quad 0X3fe2700000000000
+            .quad 0X3fe2600000000000
+            .quad 0X3fe2500000000000
+            .quad 0X3fe2500000000000
+            .quad 0X3fe2400000000000
+            .quad 0X3fe2300000000000
+            .quad 0X3fe2300000000000
+            .quad 0X3fe2200000000000
+            .quad 0X3fe2100000000000
+            .quad 0X3fe2100000000000
+            .quad 0X3fe2000000000000
+            .quad 0X3fe2000000000000
+            .quad 0X3fe1f00000000000
+            .quad 0X3fe1e00000000000
+            .quad 0X3fe1e00000000000
+            .quad 0X3fe1d00000000000
+            .quad 0X3fe1c00000000000
+            .quad 0X3fe1c00000000000
+            .quad 0X3fe1b00000000000
+            .quad 0X3fe1b00000000000
+            .quad 0X3fe1a00000000000
+            .quad 0X3fe1900000000000
+            .quad 0X3fe1900000000000
+            .quad 0X3fe1800000000000
+            .quad 0X3fe1800000000000
+            .quad 0X3fe1700000000000
+            .quad 0X3fe1600000000000
+            .quad 0X3fe1600000000000
+            .quad 0X3fe1500000000000
+            .quad 0X3fe1500000000000
+            .quad 0X3fe1400000000000
+            .quad 0X3fe1300000000000
+            .quad 0X3fe1300000000000
+            .quad 0X3fe1200000000000
+            .quad 0X3fe1200000000000
+            .quad 0X3fe1100000000000
+            .quad 0X3fe1100000000000
+            .quad 0X3fe1000000000000
+            .quad 0X3fe0f00000000000
+            .quad 0X3fe0f00000000000
+            .quad 0X3fe0e00000000000
+            .quad 0X3fe0e00000000000
+            .quad 0X3fe0d00000000000
+            .quad 0X3fe0d00000000000
+            .quad 0X3fe0c00000000000
+            .quad 0X3fe0c00000000000
+            .quad 0X3fe0b00000000000
+            .quad 0X3fe0a00000000000
+            .quad 0X3fe0a00000000000
+            .quad 0X3fe0900000000000
+            .quad 0X3fe0900000000000
+            .quad 0X3fe0800000000000
+            .quad 0X3fe0800000000000
+            .quad 0X3fe0700000000000
+            .quad 0X3fe0700000000000
+            .quad 0X3fe0600000000000
+            .quad 0X3fe0600000000000
+            .quad 0X3fe0500000000000
+            .quad 0X3fe0500000000000
+            .quad 0X3fe0400000000000
+            .quad 0X3fe0400000000000
+            .quad 0X3fe0300000000000
+            .quad 0X3fe0300000000000
+            .quad 0X3fe0200000000000
+            .quad 0X3fe0200000000000
+            .quad 0X3fe0100000000000
+            .quad 0X3fe0100000000000
+            .quad 0X3fe0000000000000
+    
+.align 32
+.L__CubeRootTable_256:   .quad 0X3ff0000000000000 # entry 0 is the bit pattern of 1.0; the table holds 256 quadwords in total (IEEE-754 doubles; by its name presumably cube-root values -- TODO confirm against the code that indexes it)
+                         .quad 0X3ff00558e6547c36 
+                         .quad 0X3ff00ab8f9d2f374 
+                         .quad 0X3ff010204b673fc7 
+                         .quad 0X3ff0158eec36749b 
+                         .quad 0X3ff01b04ed9fdb53 
+                         .quad 0X3ff02082613df53c 
+                         .quad 0X3ff0260758e78308 
+                         .quad 0X3ff02b93e6b091f0 
+                         .quad 0X3ff031281ceb8ea2 
+                         .quad 0X3ff036c40e2a5e2a 
+                         .quad 0X3ff03c67cd3f7cea 
+                         .quad 0X3ff03f3c9fee224c 
+                         .quad 0X3ff044ec379f7f79 
+                         .quad 0X3ff04aa3cd578d67 
+                         .quad 0X3ff0506374d40a3d 
+                         .quad 0X3ff0562b4218a6e3 
+                         .quad 0X3ff059123d3a9848 
+                         .quad 0X3ff05ee6694e7166 
+                         .quad 0X3ff064c2ee6e07c6 
+                         .quad 0X3ff06aa7e19c01c5 
+                         .quad 0X3ff06d9d8b1decca 
+                         .quad 0X3ff0738f4b6cc8e2 
+                         .quad 0X3ff07989af9f9f59 
+                         .quad 0X3ff07c8a2611201c 
+                         .quad 0X3ff08291a9958f03 
+                         .quad 0X3ff088a208c3fe28 
+                         .quad 0X3ff08bad91dd7d8b 
+                         .quad 0X3ff091cb6588465e 
+                         .quad 0X3ff097f24eab04a1 
+                         .quad 0X3ff09b0932aee3f2 
+                         .quad 0X3ff0a13de8970de4 
+                         .quad 0X3ff0a45bc08a5ac7 
+                         .quad 0X3ff0aa9e79bfa986 
+                         .quad 0X3ff0b0eaa961ca5b 
+                         .quad 0X3ff0b4145573271c 
+                         .quad 0X3ff0ba6ee5f9aad4 
+                         .quad 0X3ff0bd9fd0dbe02d 
+                         .quad 0X3ff0c408fc1cfd4b 
+                         .quad 0X3ff0c741430e2059 
+                         .quad 0X3ff0cdb9442ea813 
+                         .quad 0X3ff0d0f905168e6c 
+                         .quad 0X3ff0d7801893d261 
+                         .quad 0X3ff0dac772091bde 
+                         .quad 0X3ff0e15dd5c330ab 
+                         .quad 0X3ff0e4ace71080a4 
+                         .quad 0X3ff0e7fe920f3037 
+                         .quad 0X3ff0eea9c37e497e 
+                         .quad 0X3ff0f203512f4314 
+                         .quad 0X3ff0f8be68db7f32 
+                         .quad 0X3ff0fc1ffa42d902 
+                         .quad 0X3ff102eb3af9ed89 
+                         .quad 0X3ff10654f1e29cfb 
+                         .quad 0X3ff109c1679c189f 
+                         .quad 0X3ff110a29f080b3d 
+                         .quad 0X3ff114176891738a 
+                         .quad 0X3ff1178f0099b429 
+                         .quad 0X3ff11e86ac2cd7ab 
+                         .quad 0X3ff12206c7cf4046 
+                         .quad 0X3ff12589c21fb842 
+                         .quad 0X3ff12c986355d0d2 
+                         .quad 0X3ff13024129645cf 
+                         .quad 0X3ff133b2b13aa0eb 
+                         .quad 0X3ff13ad8cdc48ba3 
+                         .quad 0X3ff13e70544b1d4f 
+                         .quad 0X3ff1420adb77c99a 
+                         .quad 0X3ff145a867b1bfea 
+                         .quad 0X3ff14ceca1189d6d 
+                         .quad 0X3ff15093574284e9 
+                         .quad 0X3ff1543d2473ea9b 
+                         .quad 0X3ff157ea0d433a46 
+                         .quad 0X3ff15f4d44462724 
+                         .quad 0X3ff163039bd7cde6 
+                         .quad 0X3ff166bd21c3a8e2 
+                         .quad 0X3ff16a79dad1fb59 
+                         .quad 0X3ff171fcf9aaac3d 
+                         .quad 0X3ff175c3693980c3 
+                         .quad 0X3ff1798d1f73f3ef 
+                         .quad 0X3ff17d5a2156e97f 
+                         .quad 0X3ff1812a73ea2593 
+                         .quad 0X3ff184fe1c406b8f 
+                         .quad 0X3ff18caf82b8dba4 
+                         .quad 0X3ff1908d4b38a510 
+                         .quad 0X3ff1946e7e36f7e5 
+                         .quad 0X3ff1985320ff72a2 
+                         .quad 0X3ff19c3b38e975a8 
+                         .quad 0X3ff1a026cb58453d 
+                         .quad 0X3ff1a415ddbb2c10 
+                         .quad 0X3ff1a808758d9e32 
+                         .quad 0X3ff1aff84bac98ea 
+                         .quad 0X3ff1b3f5952e1a50 
+                         .quad 0X3ff1b7f67a896220 
+                         .quad 0X3ff1bbfb0178d186 
+                         .quad 0X3ff1c0032fc3cf91 
+                         .quad 0X3ff1c40f0b3eefc4 
+                         .quad 0X3ff1c81e99cc193f 
+                         .quad 0X3ff1cc31e15aae72 
+                         .quad 0X3ff1d048e7e7b565 
+                         .quad 0X3ff1d463b37e0090 
+                         .quad 0X3ff1d8824a365852 
+                         .quad 0X3ff1dca4b237a4f7 
+                         .quad 0X3ff1e0caf1b71965 
+                         .quad 0X3ff1e4f50ef85e61 
+                         .quad 0X3ff1e923104dbe76 
+                         .quad 0X3ff1ed54fc185286 
+                         .quad 0X3ff1f18ad8c82efc 
+                         .quad 0X3ff1f5c4acdc91aa 
+                         .quad 0X3ff1fa027ee4105b 
+                         .quad 0X3ff1fe44557cc808 
+                         .quad 0X3ff2028a37548ccf 
+                         .quad 0X3ff206d42b291a95 
+                         .quad 0X3ff20b2237c8466a 
+                         .quad 0X3ff20f74641030a6 
+                         .quad 0X3ff213cab6ef77c7 
+                         .quad 0X3ff2182537656c13 
+                         .quad 0X3ff21c83ec824406 
+                         .quad 0X3ff220e6dd675180 
+                         .quad 0X3ff2254e114737d2 
+                         .quad 0X3ff229b98f66228c 
+                         .quad 0X3ff22e295f19fd31 
+                         .quad 0X3ff2329d87caabb6 
+                         .quad 0X3ff2371610f243f2 
+                         .quad 0X3ff23b93021d47da 
+                         .quad 0X3ff2401462eae0b8 
+                         .quad 0X3ff2449a3b0d1b3f 
+                         .quad 0X3ff2449a3b0d1b3f 
+                         .quad 0X3ff2492492492492 
+                         .quad 0X3ff24db370778844 
+                         .quad 0X3ff25246dd846f45 
+                         .quad 0X3ff256dee16fdfd4 
+                         .quad 0X3ff25b7b844dfe71 
+                         .quad 0X3ff2601cce474fd2 
+                         .quad 0X3ff264c2c798fbe5 
+                         .quad 0X3ff2696d789511e2 
+                         .quad 0X3ff2696d789511e2 
+                         .quad 0X3ff26e1ce9a2cd73 
+                         .quad 0X3ff272d1233edcf3 
+                         .quad 0X3ff2778a2dfba8d0 
+                         .quad 0X3ff27c4812819c13 
+                         .quad 0X3ff2810ad98f6e10 
+                         .quad 0X3ff285d28bfa6d45 
+                         .quad 0X3ff285d28bfa6d45 
+                         .quad 0X3ff28a9f32aecb79 
+                         .quad 0X3ff28f70d6afeb08 
+                         .quad 0X3ff294478118ad83 
+                         .quad 0X3ff299233b1bc38a 
+                         .quad 0X3ff299233b1bc38a 
+                         .quad 0X3ff29e040e03fdfb 
+                         .quad 0X3ff2a2ea0334a07b 
+                         .quad 0X3ff2a7d52429b556 
+                         .quad 0X3ff2acc57a7862c2 
+                         .quad 0X3ff2acc57a7862c2 
+                         .quad 0X3ff2b1bb0fcf4190 
+                         .quad 0X3ff2b6b5edf6b54a 
+                         .quad 0X3ff2bbb61ed145cf 
+                         .quad 0X3ff2c0bbac5bfa6e 
+                         .quad 0X3ff2c0bbac5bfa6e 
+                         .quad 0X3ff2c5c6a0aeb681 
+                         .quad 0X3ff2cad705fc97a6 
+                         .quad 0X3ff2cfece6945583 
+                         .quad 0X3ff2cfece6945583 
+                         .quad 0X3ff2d5084ce0a331 
+                         .quad 0X3ff2da294368924f 
+                         .quad 0X3ff2df4fd4cff7c3 
+                         .quad 0X3ff2df4fd4cff7c3 
+                         .quad 0X3ff2e47c0bd7d237 
+                         .quad 0X3ff2e9adf35eb25a 
+                         .quad 0X3ff2eee5966124e8 
+                         .quad 0X3ff2eee5966124e8 
+                         .quad 0X3ff2f422fffa1e92 
+                         .quad 0X3ff2f9663b6369b6 
+                         .quad 0X3ff2feaf53f61612 
+                         .quad 0X3ff2feaf53f61612 
+                         .quad 0X3ff303fe552aea57 
+                         .quad 0X3ff309534a9ad7ce 
+                         .quad 0X3ff309534a9ad7ce 
+                         .quad 0X3ff30eae3fff6ff3 
+                         .quad 0X3ff3140f41335c2f 
+                         .quad 0X3ff3140f41335c2f 
+                         .quad 0X3ff319765a32d7ae 
+                         .quad 0X3ff31ee3971c2b5b 
+                         .quad 0X3ff3245704302c13 
+                         .quad 0X3ff3245704302c13 
+                         .quad 0X3ff329d0add2bb20 
+                         .quad 0X3ff32f50a08b48f9 
+                         .quad 0X3ff32f50a08b48f9 
+                         .quad 0X3ff334d6e9055a5f 
+                         .quad 0X3ff33a6394110fe6 
+                         .quad 0X3ff33a6394110fe6 
+                         .quad 0X3ff33ff6aea3afed 
+                         .quad 0X3ff3459045d8331b 
+                         .quad 0X3ff3459045d8331b 
+                         .quad 0X3ff34b3066efd36b 
+                         .quad 0X3ff350d71f529dd8 
+                         .quad 0X3ff350d71f529dd8 
+                         .quad 0X3ff356847c9006b4 
+                         .quad 0X3ff35c388c5f80bf 
+                         .quad 0X3ff35c388c5f80bf 
+                         .quad 0X3ff361f35ca116ff 
+                         .quad 0X3ff361f35ca116ff 
+                         .quad 0X3ff367b4fb5e0985 
+                         .quad 0X3ff36d7d76c96d0a 
+                         .quad 0X3ff36d7d76c96d0a 
+                         .quad 0X3ff3734cdd40cd95 
+                         .quad 0X3ff379233d4cd42a 
+                         .quad 0X3ff379233d4cd42a 
+                         .quad 0X3ff37f00a5a1ef96 
+                         .quad 0X3ff37f00a5a1ef96 
+                         .quad 0X3ff384e52521006c 
+                         .quad 0X3ff38ad0cad80848 
+                         .quad 0X3ff38ad0cad80848 
+                         .quad 0X3ff390c3a602dc60 
+                         .quad 0X3ff390c3a602dc60 
+                         .quad 0X3ff396bdc60bdb88 
+                         .quad 0X3ff39cbf3a8ca7a9 
+                         .quad 0X3ff39cbf3a8ca7a9 
+                         .quad 0X3ff3a2c8134ee2d1 
+                         .quad 0X3ff3a2c8134ee2d1 
+                         .quad 0X3ff3a8d8604cefe3 
+                         .quad 0X3ff3aef031b2b706 
+                         .quad 0X3ff3aef031b2b706 
+                         .quad 0X3ff3b50f97de6de5 
+                         .quad 0X3ff3b50f97de6de5 
+                         .quad 0X3ff3bb36a36163d8 
+                         .quad 0X3ff3bb36a36163d8 
+                         .quad 0X3ff3c1656500d20a 
+                         .quad 0X3ff3c79bedb6afb8 
+                         .quad 0X3ff3c79bedb6afb8 
+                         .quad 0X3ff3cdda4eb28aa2 
+                         .quad 0X3ff3cdda4eb28aa2 
+                         .quad 0X3ff3d420995a63c0 
+                         .quad 0X3ff3d420995a63c0 
+                         .quad 0X3ff3da6edf4b9061 
+                         .quad 0X3ff3da6edf4b9061 
+                         .quad 0X3ff3e0c5325b9fc2 
+                         .quad 0X3ff3e723a499453f 
+                         .quad 0X3ff3e723a499453f 
+                         .quad 0X3ff3ed8a484d473a 
+                         .quad 0X3ff3ed8a484d473a 
+                         .quad 0X3ff3f3f92ffb72d8 
+                         .quad 0X3ff3f3f92ffb72d8 
+                         .quad 0X3ff3fa706e6394a4 
+                         .quad 0X3ff3fa706e6394a4 
+                         .quad 0X3ff400f01682764a 
+                         .quad 0X3ff400f01682764a 
+                         .quad 0X3ff407783b92e17a 
+                         .quad 0X3ff407783b92e17a 
+                         .quad 0X3ff40e08f10ea81a 
+                         .quad 0X3ff40e08f10ea81a 
+                         .quad 0X3ff414a24aafb1e6 
+                         .quad 0X3ff414a24aafb1e6 
+                         .quad 0X3ff41b445c710fa7 
+                         .quad 0X3ff41b445c710fa7 
+                         .quad 0X3ff421ef3a901411 
+                         .quad 0X3ff421ef3a901411 
+                         .quad 0X3ff428a2f98d728b 
+
+
+
+
+
+
diff --git a/src/gas/copysign.S b/src/gas/copysign.S
new file mode 100644
index 0000000..d5b96cf
--- /dev/null
+++ b/src/gas/copysign.S
@@ -0,0 +1,63 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#copysign.S
+#
+# An implementation of the copysign libm function.
+#
+# The copysign functions produce a value with the magnitude of x and the sign of y.
+# They produce a NaN (with the sign of y) if x is a NaN. On implementations that
+# represent a signed zero but do not treat negative zero consistently in arithmetic
+# operations, the copysign functions regard the sign of zero as positive.
+#
+#
+# Prototype:
+#
+#     double copysign(double x, double y)
+#
+#
+#
+#   Algorithm:
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(copysign)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.align 16
+.p2align 4,,15
+.globl fname
+.type fname,@function
+fname:
+
+	psrlq   $63, %xmm1              # bring the sign bit of y (xmm1) down to bit 0, dropping everything else
+	psllq   $1, %xmm0               # push the sign bit of x (xmm0) out of the top of the lane
+	psllq   $63, %xmm1              # xmm1 = sign bit of y back in bit 63, all other bits zero
+	psrlq   $1, %xmm0               # xmm0 = x with its sign bit cleared (logical shift refills with zero)
+	por     %xmm1, %xmm0            # result = magnitude of x combined with sign of y, returned in xmm0
+	
+    ret
diff --git a/src/gas/copysignf.S b/src/gas/copysignf.S
new file mode 100644
index 0000000..90e63d6
--- /dev/null
+++ b/src/gas/copysignf.S
@@ -0,0 +1,70 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#copysignf.S
+#
+# An implementation of the copysignf libm function.
+#
+# The copysign functions produce a value with the magnitude of x and the sign of y.
+# They produce a NaN (with the sign of y) if x is a NaN. On implementations that
+# represent a signed zero but do not treat negative zero consistently in arithmetic
+# operations, the copysign functions regard the sign of zero as positive.
+#
+# Prototype:
+#
+#     float copysignf(float x, float y)
+#
+
+#
+#   Algorithm:
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(copysignf)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.align 16
+.p2align 4,,15
+.globl fname
+.type fname,@function
+fname:
+	# Inputs:  xmm0 = x (supplies the magnitude), xmm1 = y (supplies the sign)
+	# Output:  xmm0 = x with its sign bit replaced by the sign bit of y
+
+	PSLLD $1,%xmm0               # shift the sign bit of x out of the top of each 32-bit lane
+	PSRLD $1,%xmm0               # logical shift back: xmm0 = x with the sign bit cleared
+	PSRLD $31,%xmm1              # move the sign bit of y down to bit 0, dropping all other bits
+	PSLLD $31,%xmm1              # xmm1 = sign bit of y in bit 31, everything else zero
+	POR   %xmm1,%xmm0            # merge magnitude and sign
+	
+    ret
+
+# Unused mask constant kept for reference only. All of its lines are commented out so that no data bytes are emitted into .text after the ret.
+#.L__sign_mask:               .long 0x7FFFFFFF
+#                             .long 0x0
+#                             .quad 0x0
+
diff --git a/src/gas/cos.S b/src/gas/cos.S
new file mode 100644
index 0000000..dc227e0
--- /dev/null
+++ b/src/gas/cos.S
@@ -0,0 +1,485 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#
+# An implementation of the cos function.
+#
+# Prototype:
+#
+#     double cos(double x);
+#
+#   Computes cos(x).
+#   It will provide proper C99 return values,
+#   but may not raise floating point status bits properly.
+#   Based on the NAG C implementation.
+#
+#
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.data
+.align 32
+.L__real_7fffffffffffffff: .quad 0x07fffffffffffffff  # mask with every bit set except the sign bit
+                        .quad 0                       # padding: every constant in this group is followed by a zero quadword (16 bytes per entry)
+.L__real_3ff0000000000000: .quad 0x03ff0000000000000  # 1.0
+                        .quad 0                    
+.L__real_3fe0000000000000: .quad 0x03fe0000000000000  # 0.5
+                        .quad 0
+.L__real_3fc5555555555555: .quad 0x03fc5555555555555  # 0.166666666666 (1/6), used for x - x*x*x/6
+                        .quad 0
+.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883  # twobypi: multiplier used to compute npi2 = (int)(x * twobypi + 0.5)
+                        .quad 0
+.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000  # piby2_1: rhead = x - npi2 * piby2_1
+                        .quad 0
+.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331  # piby2_1tail: rtail = npi2 * piby2_1tail
+                        .quad 0
+.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000  # piby2_2: same leading bits as piby2_1tail with the low mantissa bits zeroed
+                        .quad 0
+.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073  # piby2_2tail: used with piby2_2 when expdiff > 15
+                        .quad 0                   
+.L__real_fffffffff8000000: .quad 0x0fffffffff8000000  # mask for stripping head and tail (clears the low 27 bits)
+                        .quad 0                    
+.L__real_411E848000000000: .quad 0x415312d000000000   # 5e6: |x| at or above this goes to .Lcos_reduce_precise. The label name still spells the older 5e5 bit pattern (0x411E848000000000).
+                        .quad 0
+.L__real_bfe0000000000000: .quad 0x0bfe0000000000000  # - 0.5
+                        .quad 0
+                        
+.align 32
+.Lcosarray:                                           # cosine polynomial coefficients c1..c6, one per 16 bytes (addressed as .Lcosarray+0x10*k)
+    .quad    0x3fa5555555555555                       # 0.0416667     c1
+    .quad    0
+    .quad    0xbf56c16c16c16967                       # -0.00138889   c2
+    .quad    0
+    .quad    0x3EFA01A019F4EC91                       # 2.48016e-005  c3
+    .quad    0
+    .quad    0xbE927E4FA17F667B                       # -2.75573e-007 c4
+    .quad    0
+    .quad    0x3E21EEB690382EEC                       # 2.08761e-009  c5
+    .quad    0
+    .quad    0xbDA907DB47258AA7                       # -1.13826e-011 c6
+    .quad    0
+
+.align 32
+.Lsinarray:                                           # sine polynomial coefficients s1..s6, same 16-byte-per-entry layout as .Lcosarray
+    .quad    0xbfc5555555555555                       # -0.166667     s1
+    .quad    0
+    .quad    0x3f81111111110bb3                       # 0.00833333    s2
+    .quad    0
+    .quad    0xbf2a01a019e83e5c                       # -0.000198413  s3
+    .quad    0
+    .quad    0x3ec71de3796cde01                       # 2.75573e-006  s4
+    .quad    0
+    .quad    0xbe5ae600b42fdfa7                       # -2.50511e-008 s5
+    .quad    0
+    .quad    0x3de5e0b2f9a43bb8                       # 1.59181e-010  s6
+    .quad    0
+
+.text
+.align 32
+.p2align 5,,31
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(cos)
+#define fname_special _cos_special@PLT
+    
+# Local variable storage offsets, relative to rsp after the prologue. (Reworded: a comment starting with the word define after the hash is taken by the C preprocessor as a macro definition.)
+.equ    p_temp,    0x30                               # scratch slot: a double is stored here from an xmm register and reloaded into an integer register
+.equ    p_temp1,   0x40                               # second scratch slot (not referenced in the part of the function reviewed here)
+.equ    r,         0x50                               # slot for r; its address is passed in rdi to __amd_remainder_piby2
+.equ    rr,        0x60                               # slot for rr; its address is passed in rsi to __amd_remainder_piby2
+.equ    region,    0x70                               # slot for region; its address is passed in rdx to __amd_remainder_piby2
+.equ   stack_size, 0x98                               # bytes subtracted from rsp on entry; 0x98 is 8 mod 16, so with the usual SysV entry alignment rsp is 16-byte aligned at the call
+
+.globl fname
+.type  fname,@function
+
+fname:
+   sub      $stack_size, %rsp
+    xorpd   %xmm2, %xmm2                              # zeroed out for later use
+
+# GET_BITS_DP64(x, ux);
+# get the input value to an integer register.
+    movsd   %xmm0,p_temp(%rsp)
+    mov     p_temp(%rsp), %rdx                        # rdx is ux
+
+##  if NaN or inf
+    mov     $0x07ff0000000000000, %rax
+    mov     %rax, %r10
+    and     %rdx, %r10
+    cmp     %rax, %r10
+    jz      .Lcos_naninf
+
+#  ax = (ux & ~SIGNBIT_DP64);
+    mov     $0x07fffffffffffffff, %r10
+    and     %rdx, %r10                                # r10 is ax
+    mov     $1, %r8d                                  # for determining region later on
+
+
+##  if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
+    mov     $0x03fe921fb54442d18, %rax
+    cmp     %rax, %r10
+    jg      .Lcos_reduce
+
+##      if (ax < 0x3f20000000000000) /* abs(x) < 2.0^(-13) */
+    mov     $0x03f20000000000000, %rax
+    cmp     %rax, %r10
+    jge     .Lcos_small
+
+##          if (ax < 0x3e40000000000000) /* abs(x) < 2.0^(-27) */
+    mov     $0x03e40000000000000, %rax
+    cmp     %rax, %r10
+    jge     .Lcos_smaller
+
+#                  cos = 1.0;
+    movsd   .L__real_3ff0000000000000(%rip), %xmm0    # return a 1
+    jmp     .Lcos_cleanup     
+
+##          else
+.align 16
+.Lcos_smaller:
+#              cos = 1.0 - x*x*0.5;
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm2, %xmm2                # x^2
+    movsd   .L__real_3ff0000000000000(%rip), %xmm0    # 1.0
+    mulsd   .L__real_3fe0000000000000(%rip), %xmm2    # 0.5 * x^2
+    subsd   %xmm2, %xmm0
+    jmp     .Lcos_cleanup  
+           
+##      else
+
+.align 16
+.Lcos_small:
+#          cos = cos_piby4(x, 0.0);
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm0, %xmm2                              # x2
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# region 0 or 2     - do a cos calculation
+#  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
+
+    movsd   .Lcosarray+0x10(%rip), %xmm1              # c2
+    movsd   %xmm2, %xmm4                              # move for x4
+    mulsd   %xmm2, %xmm4                              # x4
+    movsd   .Lcosarray+0x30(%rip), %xmm3              # c4
+    mulsd   %xmm2, %xmm1                              # c2x2
+    movsd   .Lcosarray+0x50(%rip), %xmm5              # c6
+    mulsd   %xmm2, %xmm3                              # c4x2
+    movsd   %xmm4, %xmm0                              # move for x8
+    mulsd   %xmm2, %xmm5                              # c6x2
+    mulsd   %xmm4, %xmm0                              # x8
+    addsd   .Lcosarray(%rip), %xmm1                   # c1 + c2x2
+    mulsd   %xmm4, %xmm1                              # c1x4 + c2x6
+    addsd   .Lcosarray+0x20(%rip), %xmm3              # c3 + c4x2
+    mulsd   .L__real_bfe0000000000000(%rip), %xmm2    # -0.5x2, destroy xmm2
+    addsd   .Lcosarray+0x40(%rip), %xmm5              # c5 + c6x2
+    mulsd   %xmm0, %xmm3                              # c3x8 + c4x10    
+    mulsd   %xmm0, %xmm4                              # x12    
+    mulsd   %xmm5, %xmm4                              # c5x12 + c6x14
+
+    movsd   .L__real_3ff0000000000000(%rip), %xmm0    # 1    
+    addsd   %xmm3, %xmm1                              # c1x4 + c2x6 + c3x8 + c4x10
+    movsd   %xmm2, %xmm3                              # preserve -0.5x2
+    addsd   %xmm0, %xmm2                              # t = 1 - 0.5x2
+    subsd   %xmm2, %xmm0                              # 1-t
+    addsd   %xmm3, %xmm0                              # (1-t) - r
+    addsd   %xmm4, %xmm1                              # c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
+    addsd   %xmm1, %xmm0                              # (1-t) - r + c1x4 + c2x6 + c3x8 + c4x10 + c5x12 + c6x14
+    addsd   %xmm2, %xmm0                              # 1 - 0.5x2 + above
+
+    jmp     .Lcos_cleanup
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.align 16
+.Lcos_reduce:
+
+#  xneg = (ax != ux);
+    cmp     %r10, %rdx
+
+##  if (xneg) x = -x;
+    jz      .Lpositive
+    subsd   %xmm0, %xmm2
+    movsd   %xmm2, %xmm0
+
+.Lpositive:
+##  if (x < 5.0e6)
+    cmp     .L__real_411E848000000000(%rip), %r10
+    jae     .Lcos_reduce_precise
+
+# reduce  the argument to be in a range from -pi/4 to +pi/4
+# by subtracting multiples of pi/2
+    movsd   %xmm0, %xmm2
+    movsd   .L__real_3fe45f306dc9c883(%rip), %xmm3    # twobypi
+    movsd   %xmm0, %xmm4
+    movsd   .L__real_3fe0000000000000(%rip), %xmm5    # .5
+    mulsd   %xmm3, %xmm2
+
+#/* How many pi/2 is x a multiple of? */
+#      xexp  = ax >> EXPSHIFTBITS_DP64;
+    mov     %r10, %r9
+    shr     $52, %r9                                  # >>EXPSHIFTBITS_DP64
+
+#        npi2  = (int)(x * twobypi + 0.5);
+    addsd   %xmm5, %xmm2                              # npi2
+
+    movsd   .L__real_3ff921fb54400000(%rip), %xmm3    # piby2_1
+    cvttpd2dq    %xmm2, %xmm0                         # convert to integer
+    movsd   .L__real_3dd0b4611a626331(%rip), %xmm1    # piby2_1tail
+    cvtdq2pd    %xmm0, %xmm2                          # and back to float.
+
+#      /* Subtract the multiple from x to get an extra-precision remainder */
+#      rhead  = x - npi2 * piby2_1;
+    mulsd   %xmm2, %xmm3
+    subsd   %xmm3, %xmm4                              # rhead
+
+#      rtail  = npi2 * piby2_1tail;
+    mulsd   %xmm2, %xmm1
+    movd    %xmm0, %eax
+
+#      GET_BITS_DP64(rhead-rtail, uy);             
+    movsd   %xmm4, %xmm0
+    subsd   %xmm1, %xmm0
+
+    movsd   .L__real_3dd0b4611a600000(%rip), %xmm3    # piby2_2
+    movsd   %xmm0,p_temp(%rsp)
+    movsd   .L__real_3ba3198a2e037073(%rip), %xmm5    # piby2_2tail
+    mov     p_temp(%rsp), %rcx                        # rcx is rhead-rtail
+
+#    xmm0=r, xmm4=rhead, xmm1=rtail, xmm2=npi2, xmm3=temp for calc, xmm5= temp for calc
+#      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+    shl     $1, %rcx                                  # strip any sign bit
+    shr     $53, %rcx                                 # >> EXPSHIFTBITS_DP64 +1
+    sub     %rcx, %r9                                 # expdiff
+
+##      if (expdiff > 15)
+    cmp     $15, %r9
+    jle     .Lexpdiffless15
+
+#          /* The remainder is pretty small compared with x, which
+#             implies that x is a near multiple of pi/2
+#             (x matches the multiple to at least 15 bits) */
+
+#          t  = rhead;
+    movsd   %xmm4, %xmm1
+
+#          rtail  = npi2 * piby2_2;
+    mulsd   %xmm2, %xmm3
+
+#          rhead  = t - rtail;
+    mulsd   %xmm2, %xmm5                              # npi2 * piby2_2tail
+    subsd   %xmm3, %xmm4                              # rhead
+
+#          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
+    subsd   %xmm4, %xmm1                              # t - rhead
+    subsd   %xmm3, %xmm1                              # -rtail
+    subsd   %xmm1, %xmm5                              # rtail
+
+#      r = rhead - rtail;
+    movsd   %xmm4, %xmm0
+
+#HARSHA
+#xmm1=rtail
+    movsd   %xmm5, %xmm1
+    subsd   %xmm5, %xmm0
+
+#    xmm0=r, xmm4=rhead, xmm1=rtail
+.Lexpdiffless15:
+#      region = npi2 & 3;
+
+    subsd   %xmm0, %xmm4                              # rhead-r
+    subsd   %xmm1, %xmm4                              # rr = (rhead-r) - rtail
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+## if the input was close to a pi/2 multiple
+# The original NAG code missed this trick.  If the input is very close to n*pi/2 after
+# reduction,
+# then the cos is ~ 1.0 , to within 53 bits, when r is < 2^-27.  We already
+# have x at this point, so we can skip the cos polynomials.
+
+    cmp     $0x03f2, %rcx                             # if r  small.
+    jge     .Lcos_piby4                               # use taylor series if not
+    cmp     $0x03de, %rcx                             # if r really small.
+    jle     .Lr_small                                 # then cos(r) = 1
+
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm2, %xmm2                              # x^2
+
+##      if region is 1 or 3    do a sin calc.
+    and     %eax, %r8d
+    jz      .Lsinsmall
+
+# region 1 or 3
+# use a simple polynomial
+#              *s = x - x*x*x*0.166666666666666666;
+    movsd   .L__real_3fc5555555555555(%rip), %xmm3    
+    mulsd   %xmm0, %xmm3                              # * x
+    mulsd   %xmm2, %xmm3                              # * x^2
+    subsd   %xmm3, %xmm0                              # xs
+    jmp     .Ladjust_region
+
+.align 16
+.Lsinsmall:
+# region 0 or 2
+#              cos = 1.0 - x*x*0.5;
+    movsd   .L__real_3ff0000000000000(%rip), %xmm0  # 1.0
+    mulsd   .L__real_3fe0000000000000(%rip), %xmm2  # 0.5 *x^2
+    subsd   %xmm2, %xmm0
+    jmp     .Ladjust_region
+
+.align 16
+.Lr_small:
+##      if region is 1 or 3    do a sin calc.
+    and     %eax, %r8d
+    jnz     .Ladjust_region
+
+    movsd   .L__real_3ff0000000000000(%rip), %xmm0    # cos(r) is a 1
+    jmp     .Ladjust_region
+
+.align 32
+.Lcos_reduce_precise:
+#      // Reduce x into range [-pi/4,pi/4]
+#      __amd_remainder_piby2(x, &r, &rr, &region);
+
+    lea     region(%rsp), %rdx
+    lea     rr(%rsp), %rsi
+    lea     r(%rsp), %rdi
+        
+    call    __amd_remainder_piby2@PLT
+
+    mov     $1, %r8d                                  # for determining region later on
+    movsd   r(%rsp), %xmm0                            # x
+    movsd   rr(%rsp), %xmm4                           # xx
+    mov     region(%rsp), %eax                        # region
+
+# xmm0 = x, xmm4 = xx, r8d = 1, eax= region
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.align 32
+# perform taylor series to calc sinx, cosx
+.Lcos_piby4:
+#  x2 = r * r;
+
+#xmm4 = a part of rr for the sin path, xmm4 is overwritten in the cos path
+#instead use xmm3 because that was freed up in the sin path, xmm3 is overwritten in sin path
+    movsd   %xmm0, %xmm3
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm0, %xmm2                              # x2
+
+##      if region is 1 or 3    do a sin calc.
+    and     %eax, %r8d
+    jz      .Lcospiby4
+
+# region 1 or 3
+    movsd   .Lsinarray+0x50(%rip), %xmm3              # s6
+    mulsd   %xmm2, %xmm3                              # x2s6
+    movsd   .Lsinarray+0x20(%rip), %xmm5              # s3
+    movsd   %xmm4,p_temp(%rsp)                        # store xx
+    movsd   %xmm2, %xmm1                              # move for x4
+    mulsd   %xmm2, %xmm1                              # x4
+    movsd   %xmm0,p_temp1(%rsp)                       # store x
+    mulsd   %xmm2, %xmm5                              # x2s3
+    movsd   %xmm0, %xmm4                              # move for x3
+    addsd   .Lsinarray+0x40(%rip), %xmm3              # s5+x2s6
+    mulsd   %xmm2, %xmm1                              # x6
+    mulsd   %xmm2, %xmm3                              # x2(s5+x2s6)
+    mulsd   %xmm2, %xmm4                              # x3
+    addsd   .Lsinarray+0x10(%rip), %xmm5              # s2+x2s3
+    mulsd   %xmm2, %xmm5                              # x2(s2+x2s3)
+    addsd   .Lsinarray+0x30(%rip), %xmm3              # s4 + x2(s5+x2s6)
+    mulsd   .L__real_3fe0000000000000(%rip), %xmm2    # 0.5 *x2
+    movsd   p_temp(%rsp), %xmm0                       # load xx
+    mulsd   %xmm1, %xmm3                              # x6(s4 + x2(s5+x2s6))
+    addsd   .Lsinarray(%rip), %xmm5                   # s1+x2(s2+x2s3)
+    mulsd   %xmm0, %xmm2                              # 0.5 * x2 *xx
+    addsd   %xmm5, %xmm3                              # zs
+    mulsd   %xmm3, %xmm4                              # *x3
+    subsd   %xmm2, %xmm4                              # x3*zs - 0.5 * x2 *xx
+    addsd   %xmm4, %xmm0                              # +xx
+    addsd   p_temp1(%rsp), %xmm0                      # +x
+    
+    jmp     .Ladjust_region
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.align 16
+.Lcospiby4:
+    
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+# region 0 or 2     - do a cos calculation
+#  zc = (c2 + x2 * (c3 + x2 * (c4 + x2 * (c5 + x2 * c6))));
+    mulsd   %xmm0, %xmm4                              # x*xx
+    movsd   .L__real_3fe0000000000000(%rip), %xmm5
+    movsd   .Lcosarray+0x50(%rip), %xmm1              # c6
+    movsd   .Lcosarray+0x20(%rip), %xmm0              # c3
+    mulsd   %xmm2, %xmm5                              # r = 0.5 *x2
+    movsd   %xmm2, %xmm3                              # copy of x2
+    movsd   %xmm4,p_temp(%rsp)                        # store x*xx
+    mulsd   %xmm2, %xmm1                              # c6*x2
+    mulsd   %xmm2, %xmm0                              # c3*x2
+    subsd   .L__real_3ff0000000000000(%rip), %xmm5    # -t=r-1.0    ;trash r
+    mulsd   %xmm2, %xmm3                              # x4
+    addsd   .Lcosarray+0x40(%rip), %xmm1              # c5+x2c6
+    addsd   .Lcosarray+0x10(%rip), %xmm0              # c2+x2C3
+    addsd   .L__real_3ff0000000000000(%rip), %xmm5    # 1 + (-t)    ;trash t
+    mulsd   %xmm2, %xmm3                              # x6
+    mulsd   %xmm2, %xmm1                              # x2(c5+x2c6)
+    mulsd   %xmm2, %xmm0                              # x2(c2+x2C3)
+    movsd   %xmm2, %xmm4                              # copy of x2
+    mulsd   .L__real_3fe0000000000000(%rip), %xmm4    # r recalculate
+    addsd   .Lcosarray+0x30(%rip), %xmm1              # c4 + x2(c5+x2c6)
+    addsd   .Lcosarray(%rip), %xmm0                   # c1+x2(c2+x2C3)
+    mulsd   %xmm2, %xmm2                              # x4 recalculate
+    subsd   %xmm4, %xmm5                              # (1 + (-t)) - r
+    mulsd   %xmm3, %xmm1                              # x6(c4 + x2(c5+x2c6))
+    addsd   %xmm1, %xmm0                              # zc
+    subsd   .L__real_3ff0000000000000(%rip), %xmm4    # t relaculate
+    subsd   p_temp(%rsp), %xmm5                       # ((1 + (-t)) - r) - x*xx
+    mulsd   %xmm2, %xmm0                              # x4 * zc
+    addsd   %xmm5, %xmm0                              # x4 * zc + ((1 + (-t)) - r -x*xx)
+    subsd   %xmm4, %xmm0                              # result - (-t)
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+.align 32
+.Ladjust_region:        # positive or negative (0, 1, 2, 3)=>(1, 2, 3 ,4)=>(0, 2, 2,0)
+#      switch (region)
+    add     $1, %eax
+    and     $2, %eax
+    jz      .Lcos_cleanup
+## if the original region 1 or 2 then we negate the result.
+    movsd   %xmm0, %xmm2
+    xorpd   %xmm0, %xmm0
+    subsd   %xmm2, %xmm0
+
+.align 32
+.Lcos_cleanup:
+    add     $stack_size, %rsp
+    ret
+
+.align 32
+.Lcos_naninf:
+   call     fname_special
+   add      $stack_size, %rsp
+   ret
+
+
+
diff --git a/src/gas/cosf.S b/src/gas/cosf.S
new file mode 100644
index 0000000..43eae9a
--- /dev/null
+++ b/src/gas/cosf.S
@@ -0,0 +1,372 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+# An implementation of the cosf function.
+#
+# Prototype:
+#
+#     float fastcosf(float x);
+#
+#   Computes cosf(x).  
+#   Based on the NAG C implementation.
+#   It will provide proper C99 return values,
+#   but may not raise floating point status bits properly.
+#   Author: Harsha Jagasia
+#   Email:  harsha.jagasia@amd.com
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.data
+.align 32
+.L__real_3ff0000000000000: .quad 0x03ff0000000000000  # 1.0
+                        .quad 0                       # for alignment
+.L__real_3fe0000000000000: .quad 0x03fe0000000000000  # 0.5
+                        .quad 0
+.L__real_3fc5555555555555: .quad 0x03fc5555555555555  # 0.166666666666
+                        .quad 0
+.L__real_3fe45f306dc9c883: .quad 0x03fe45f306dc9c883  # twobypi
+                        .quad 0
+.L__real_3FF921FB54442D18: .quad 0x03FF921FB54442D18  # piby2
+                        .quad 0
+.L__real_3ff921fb54400000: .quad 0x03ff921fb54400000  # piby2_1     (leading bits of pi/2, low mantissa bits zero)
+                        .quad 0
+.L__real_3dd0b4611a626331: .quad 0x03dd0b4611a626331  # piby2_1tail (pi/2 - piby2_1)
+                        .quad 0
+.L__real_3dd0b4611a600000: .quad 0x03dd0b4611a600000  # piby2_2     (leading bits of piby2_1tail)
+                        .quad 0
+.L__real_3ba3198a2e037073: .quad 0x03ba3198a2e037073  # piby2_2tail
+                        .quad 0
+.L__real_411E848000000000: .quad 0x415312d000000000   # 5.0e6 -- NOTE: the label spells the bit pattern of 5.0e5 (0x411E848000000000), the stored value is 5.0e6
+                        .quad 0
+# Interleaved sin/cos polynomial coefficients: sN is at byte offset 16*(N-1), cN at 16*(N-1)+8.
+.align 32
+.Lcsarray:
+    .quad    0x0bfc5555555555555                      # -0.166667        s1
+    .quad    0x03fa5555555555555                      # 0.0416667        c1
+    .quad    0x03f81111111110bb3                      # 0.00833333       s2
+    .quad    0x0bf56c16c16c16967                      # -0.00138889      c2
+    .quad    0x0bf2a01a019e83e5c                      # -0.000198413     s3
+    .quad    0x03efa01a019f4ec90                      # 2.48016e-005     c3
+    .quad    0x03ec71de3796cde01                      # 2.75573e-006     s4
+    .quad    0x0be927e4fa17f65f6                      # -2.75573e-007    c4
+
+.text
+.align 32
+.p2align 5,,31
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(cosf)
+#define fname_special _cosf_special@PLT
+## NOTE: free-text comment lines use "##" here; cpp would parse "# define ..." as a real directive.
+## Stack-frame slot offsets from %rsp, valid after the entry "sub $stack_size, %rsp".
+.equ    p_temp,    0x30                               # 8-byte scratch: double-to-integer bit transfer, later saves ux
+.equ    p_temp1,   0x40                               # 8-byte scratch: saves ax across the precise-reduction call
+.equ    region,    0x50                               # 32-bit region value written through the pointer passed in rdx
+.equ    r,         0x60                               # double remainder written through the pointer passed in rsi
+.equ   stack_size, 0x88                               # 0x88 = 8 mod 16: with the SysV entry alignment (rsp = 8 mod 16) rsp is 16-byte aligned after the sub
+
+.globl fname
+.type  fname,@function
+
+fname:
+# Entry: x (float) arrives in xmm0; the computed paths return a float in xmm0.
+    sub     $stack_size, %rsp
+
+##  if NaN or inf
+    movd    %xmm0, %edx                               # edx = raw bits of the float x
+    mov     $0x07f800000, %eax                        # single-precision exponent mask
+    mov     %eax, %r10d
+    and     %edx, %r10d
+    cmp     %eax, %r10d                               # exponent field all ones?
+    jz      .Lcosf_naninf
+
+    xorpd   %xmm2, %xmm2                              # xmm2 = 0.0, used by .L__sc_reducec to negate x
+    mov     %rdx, %r11                                # r11 = raw float bits of x; r11 is not read again in this file
+
+#  GET_BITS_DP64(x, ux);
+# convert input to double.
+    cvtss2sd    %xmm0, %xmm0
+
+# get the input value to an integer register.
+    movsd   %xmm0,p_temp(%rsp)
+    mov     p_temp(%rsp), %rdx                        # rdx is ux
+
+#  ax = (ux & ~SIGNBIT_DP64);
+    mov     $0x07fffffffffffffff, %r10
+    and     %rdx, %r10                                # r10 is ax
+
+    mov     $1, %r8d                                  # for determining region later on
+    movsd   %xmm0, %xmm1                              # copy x to xmm1
+
+# live from here: xmm0 = xmm1 = x (double), rdx = ux, r10 = ax, r8d = 1
+##  if (ax <= 0x3fe921fb54442d18) /* abs(x) <= pi/4 */
+    mov     $0x03fe921fb54442d18, %rax
+    cmp     %rax, %r10
+    jg      .L__sc_reducec
+
+#          *c = cos_piby4(x, 0.0);
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm2, %xmm2                              # x^2
+    xor     %eax, %eax                                # region = 0: selects the cosine polynomial, no sign flip
+    mov     %r10, %rdx                                # rdx = ax
+    movsd   .L__real_3fe0000000000000(%rip), %xmm5    # .5
+    jmp     .L__sc_piby4c
+    
+.align 32
+.L__sc_reducec:
+# reduce  the argument to be in a range from -pi/4 to +pi/4
+# by subtracting multiples of pi/2
+#  xneg = (ax != ux);
+    cmp     %r10, %rdx
+##  if (xneg) x = -x;
+    jz      .Lpositive
+    subsd   %xmm0, %xmm2                              # xmm2 = 0.0 - x (xmm2 was zeroed at entry)
+    movsd   %xmm2, %xmm0                              # xmm0 = abs(x)
+
+.Lpositive:
+##  if (x < 5.0e6)   (the constant stores 5.0e6 although its label spells the bits of 5.0e5)
+    cmp     .L__real_411E848000000000(%rip), %r10     # compares ax with the threshold as raw bit patterns
+    jae     .Lcosf_reduce_precise                     # at or above the threshold: use the external reduction routine
+
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+# fast reduction path: subtract the nearest multiple of pi/2 from abs(x) in double arithmetic
+# xmm0=abs(x), xmm1=x, r10=ax
+.align 32
+.Lcosf_piby4:
+#/* How many pi/2 is x a multiple of? */
+#        npi2  = (int)(x * twobypi + 0.5);
+
+    movsd   %xmm0, %xmm2                              # xmm2 = abs(x), becomes npi2
+    movsd   %xmm0, %xmm4                              # xmm4 = abs(x), becomes rhead
+
+    mulsd   .L__real_3fe45f306dc9c883(%rip), %xmm2    # twobypi
+    movsd   .L__real_3fe0000000000000(%rip), %xmm5    # .5
+
+#/* How many pi/2 is x a multiple of? */
+
+#      xexp  = ax >> EXPSHIFTBITS_DP64;
+    mov     %r10, %r9
+    shr     $52, %r9                                  # >> EXPSHIFTBITS_DP64
+
+#        npi2  = (int)(x * twobypi + 0.5);
+    addsd   %xmm5, %xmm2                              # npi2
+
+    movsd   .L__real_3ff921fb54400000(%rip), %xmm3    # piby2_1
+    cvttpd2dq    %xmm2, %xmm0                         # convert to integer
+    movsd   .L__real_3dd0b4611a626331(%rip), %xmm1    # piby2_1tail
+    cvtdq2pd    %xmm0, %xmm2                          # and back to double
+
+#      /* Subtract the multiple from x to get an extra-precision remainder */
+#      rhead  = x - npi2 * piby2_1;
+
+    mulsd   %xmm2, %xmm3                              # use piby2_1
+    subsd   %xmm3, %xmm4                              # rhead
+
+#      rtail  = npi2 * piby2_1tail;
+    mulsd   %xmm2, %xmm1                              # rtail
+    movd    %xmm0, %eax                               # eax = npi2 as an integer (its low two bits give the region)
+
+#      GET_BITS_DP64(rhead-rtail, uy);
+    movsd   %xmm4, %xmm0
+    subsd   %xmm1, %xmm0                              # xmm0 = r = rhead - rtail
+
+    movsd   .L__real_3dd0b4611a600000(%rip), %xmm3    # piby2_2
+    movsd   .L__real_3ba3198a2e037073(%rip), %xmm5    # piby2_2tail
+    movd    %xmm0, %rcx                               # rcx is rhead-rtail
+
+#      expdiff = xexp - ((uy & EXPBITS_DP64) >> EXPSHIFTBITS_DP64);
+    shl     $1, %rcx                                  # strip any sign bit
+    shr     $53, %rcx                                 # >> EXPSHIFTBITS_DP64 +1
+    sub     %rcx, %r9                                 # expdiff
+
+##      if (expdiff > 15)
+    cmp     $15, %r9
+    jle     .Lexpdiffless15
+
+#          /* The remainder is pretty small compared with x, which
+#             implies that x is a near multiple of pi/2
+#             (x matches the multiple to at least 15 bits) */
+
+#          t  = rhead;
+    movsd   %xmm4, %xmm1
+
+#          rtail  = npi2 * piby2_2;
+    mulsd   %xmm2, %xmm3
+
+#          rhead  = t - rtail;
+    mulsd   %xmm2, %xmm5                              # npi2 * piby2_2tail
+    subsd   %xmm3, %xmm4                              # rhead
+
+#          rtail  = npi2 * piby2_2tail - ((t - rhead) - rtail);
+    subsd   %xmm4, %xmm1                              # t - rhead
+    subsd   %xmm3, %xmm1                              # -rtail
+    subsd   %xmm1, %xmm5                              # rtail
+
+#      r = rhead - rtail;
+    movsd   %xmm4, %xmm0
+
+#HARSHA
+#xmm1=rtail
+    movsd   %xmm5, %xmm1
+    subsd   %xmm5, %xmm0                              # xmm0 = refined r; rcx is NOT recomputed for it
+
+#    xmm0=r, xmm4=rhead, xmm1=rtail
+.Lexpdiffless15:
+#      region = npi2 & 3;
+# rcx = biased exponent of rhead-rtail as first computed (not refreshed when the expdiff > 15 branch recomputed r)
+    movsd   %xmm0, %xmm2
+    mulsd   %xmm0, %xmm2        #x^2
+    movsd   %xmm0, %xmm1                              # xmm1 = r; the result is built in xmm1 from here on
+    movsd   .L__real_3fe0000000000000(%rip), %xmm5    # .5
+
+    cmp     $0x03f2, %rcx                             # if r  small.
+    jge     .L__sc_piby4c                             # use taylor series if not
+    cmp     $0x03de, %rcx                             # if r really small.
+    jle     .L__rc_small                              # then cos(r) = 1
+
+##      if region is 1 or 3    do a sin calc.
+    and     %eax, %r8d                                # r8d = low bit of npi2 (r8d was 1)
+    jz      .Lsinsmall
+# region 1 or 3
+# use a simple polynomial
+#              *s = x - x*x*x*0.166666666666666666;
+    movsd   .L__real_3fc5555555555555(%rip), %xmm3
+    mulsd   %xmm1, %xmm3                              # * x
+    mulsd   %xmm2, %xmm3                              # * x^2
+    subsd   %xmm3, %xmm1                              # xs
+    jmp     .L__adjust_region_cos
+
+.align 16
+.Lsinsmall:
+# region 0 or 2 (this is the cosine branch, despite the label name)
+#              cos = 1.0 - x*x*0.5;
+    movsd   .L__real_3ff0000000000000(%rip), %xmm1    # 1.0
+    mulsd   .L__real_3fe0000000000000(%rip), %xmm2    # 0.5 *x^2
+    subsd   %xmm2, %xmm1
+    jmp     .L__adjust_region_cos
+
+.align 16
+.L__rc_small:                                         # then sin(r) = r
+##     if region is 1 or 3   do a sin calc.
+   and      %eax, %r8d
+   jnz      .L__adjust_region_cos                     # odd region: the result is r itself, already in xmm1
+   movsd    .L__real_3ff0000000000000(%rip), %xmm1    # cos(r) is a 1
+   jmp      .L__adjust_region_cos
+
+
+# done with reducing the argument.  Now perform the sin/cos calculations.
+.align 16
+.L__sc_piby4c:
+##     if region is 1 or 3   do a sin calc.
+   and      %eax, %r8d                                # on entry: xmm0 = x, xmm2 = x^2, xmm5 = 0.5, eax = region, r8d = 1
+   jz       .Lcospiby4
+# region 1 or 3: sine polynomial, coefficients s1..s4 sit at even 8-byte slots of .Lcsarray
+   movsd    .Lcsarray+0x30(%rip), %xmm1               # s4
+   movsd    %xmm2, %xmm4
+   mulsd    %xmm2, %xmm1                              # x2s4
+   movsd    .Lcsarray+0x10(%rip), %xmm3               # s2
+   mulsd    %xmm4, %xmm4                              # x4
+   mulsd    %xmm2, %xmm3                              # x2s2
+   mulsd    %xmm0, %xmm2                              # x3
+   addsd    .Lcsarray+0x20(%rip), %xmm1               # s3 + x2s4
+   mulsd    %xmm4, %xmm1                              # x4(s3 + x2s4)
+   addsd    .Lcsarray(%rip), %xmm3                    # s1 + x2s2
+   addsd    %xmm3, %xmm1                              # s1 + s2x2 + s3x4 + s4x6
+   mulsd    %xmm2, %xmm1                              # s1x3 + s2x5 + s3x7 + s4x9
+   addsd    %xmm0, %xmm1                              # x + s1x3 + s2x5 + s3x7 + s4x9
+
+   jmp      .L__adjust_region_cos
+
+.align 16
+.Lcospiby4:
+# region 0 or 2    - do a cos calculation
+   movsd    .Lcsarray+0x38(%rip), %xmm1               # c4
+   movsd    %xmm2, %xmm4
+   mulsd    %xmm2, %xmm1                              # x2c4
+   movsd    .Lcsarray+0x18(%rip), %xmm3               # c2
+   mulsd    %xmm4, %xmm4                              # x4
+   mulsd    %xmm2, %xmm3                              # x2c2
+   mulsd    %xmm2, %xmm5                              # 0.5 * x2
+   addsd    .Lcsarray+0x28(%rip), %xmm1               # c3 + x2c4
+   mulsd    %xmm4, %xmm1                              # x4(c3 + x2c4)
+   addsd    .Lcsarray+8(%rip), %xmm3                  # c1 + x2c2
+   addsd    %xmm3, %xmm1                              # c1 + x2c2 + c3x4 + c4x6
+   mulsd    %xmm4, %xmm1                              # x4(c1 + c2x2 + c3x4 + c4x6)
+
+#  -t = rc-1;
+   subsd    .L__real_3ff0000000000000(%rip), %xmm5    # 0.5x2 - 1
+   subsd    %xmm5, %xmm1                              # cos = 1 - 0.5x2 + c1x4 + c2x6 + c3x8 + c4x10
+
+.L__adjust_region_cos:                                # xmm1 = sin or cos value left by the section that jumped or fell through here; eax = region
+#     switch (region)
+   add      $1, %eax
+   and      $2, %eax                                  # (region + 1) & 2 is nonzero exactly when region mod 4 is 1 or 2
+   jz       .L__cos_cleanup
+## if region 1 or 2 then we negate the result.
+   xorpd    %xmm2, %xmm2
+   subsd    %xmm1, %xmm2                              # xmm2 = 0.0 - result
+   movsd    %xmm2, %xmm1
+
+.align 16
+.L__cos_cleanup:
+   cvtsd2ss %xmm1, %xmm0                              # narrow the double result to the float return value
+   add      $stack_size, %rsp                         # release the frame reserved at entry
+   ret
+
+.align 16
+#;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+.Lcosf_reduce_precise:
+#     /* Reduce abs(x) into range [-pi/4,pi/4] */
+#     __amd_remainder_piby2d2f(bits of abs(x), &r, &region);   (argument meaning read off the register setup below)
+
+   mov      %rdx,p_temp(%rsp)                         # save ux for use later
+   mov      %r10,p_temp1(%rsp)                        # save ax for use later
+   movd     %xmm0, %rdi                               # arg 1: bit pattern of abs(x) as a double
+   lea      r(%rsp), %rsi                             # arg 2: address of the r slot
+   lea      region(%rsp), %rdx                        # arg 3: address of the region slot
+   sub      $0x020, %rsp                              # extra 32 bytes for the duration of the call; keeps rsp 16-byte aligned
+
+   call     __amd_remainder_piby2d2f@PLT
+
+   add      $0x020, %rsp
+   mov      p_temp(%rsp), %rdx                        # restore ux for use later
+   mov      p_temp1(%rsp), %r10                       # restore ax for use later
+   mov      $1, %r8d                                  # for determining region later on
+   movsd    r(%rsp), %xmm0                            # r
+   mov      region(%rsp), %eax                        # region
+# set up the register contract expected by .L__sc_piby4c
+   movsd    %xmm0, %xmm2
+   mulsd    %xmm0, %xmm2                              # x^2
+   movsd    %xmm0, %xmm1
+   movsd    .L__real_3fe0000000000000(%rip), %xmm5    # .5
+
+   jmp      .L__sc_piby4c
+
+.align 32
+.Lcosf_naninf:
+   call     fname_special                             # xmm0 still holds the unmodified float input here
+   add      $stack_size, %rsp
+   ret                                                # whatever the handler left in xmm0 is returned
diff --git a/src/gas/exp.S b/src/gas/exp.S
new file mode 100644
index 0000000..153e8a6
--- /dev/null
+++ b/src/gas/exp.S
@@ -0,0 +1,400 @@
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+#ifdef __x86_64__
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#
+# exp.S
+#
+# An implementation of the exp libm function.
+#
+# Prototype:
+#
+#     double exp(double x);
+#
+
+#
+#   Algorithm:
+#   
+#   e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
+#
+#   x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
+#   n = 64*m + j,   0 <= j < 64
+#   
+#   e^x = 2^((64*m + j + f)/64)
+#       = (2^m) * (2^(j/64)) * 2^(f/64)
+#       = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
+#
+#   f = x*(64/ln(2)) - n
+#   r = f*(ln(2)/64) = x - n*(ln(2)/64)
+#
+#   e^x = (2^m) * (2^(j/64)) * e^r
+#
+#   (2^(j/64)) is precomputed
+#
+#   e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
+#   e^r = 1 + q
+#
+#   q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5! + (r^6)/6!
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(exp)
+#define fname_special _exp_special@PLT
+
+.text
+.p2align 4
+.globl fname
+.type fname,@function
+fname:
+    ucomisd      .L__max_exp_arg(%rip), %xmm0           # compare x (xmm0) with the overflow threshold
+    ja           .L__y_is_inf                           # x > max_exp_arg
+    jp           .L__y_is_nan                           # unordered compare: x is NaN
+    ucomisd      .L__denormal_tiny_threshold(%rip), %xmm0
+    jbe          .L__y_is_zero                          # x <= threshold: result is the smallest denormal or zero
+
+    # x * (64/ln(2))
+    movapd      %xmm0,%xmm1
+    mulsd       .L__real_64_by_log2(%rip), %xmm1
+
+    # n = int( x * (64/ln(2)) )
+    cvttpd2dq    %xmm1, %xmm2   #xmm2 = (int)n   (cvttpd2dq truncates toward zero)
+    cvtdq2pd    %xmm2, %xmm1   #xmm1 = (double)n
+    movd        %xmm2, %ecx    #ecx = n
+    movapd     %xmm1,%xmm2     #xmm2 = (double)n, kept for the tail product
+    # r1 = x - n * ln(2)/64 head
+    mulsd    .L__log2_by_64_mhead(%rip),%xmm1   #xmm1 = n * (-ln(2)/64 head)
+
+    #j = n & 0x3f
+    mov         $0x3f, %rax
+    and         %ecx, %eax     #eax = j
+    # m = (n - j) / 64
+    sar         $6, %ecx       #ecx = m
+
+
+    # r2 = - n * ln(2)/64 tail
+    mulsd    .L__log2_by_64_mtail(%rip),%xmm2   #xmm2 = r2
+    addsd    %xmm1,%xmm0   #xmm0 = r1
+
+    # r1+r2
+    addsd       %xmm0, %xmm2 #xmm2 = r
+
+    # q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
+    # q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
+    movapd       .L__real_1_by_720(%rip), %xmm3  #xmm3 = 1/720   (movapd: the operand must be 16-byte aligned)
+    mulsd       %xmm2, %xmm3    #xmm3 = r*1/720
+    movapd       .L__real_1_by_6(%rip), %xmm0    #xmm0 = 1/6
+    movapd      %xmm2, %xmm1 #xmm1 = r
+    mulsd       %xmm2, %xmm0    #xmm0 = r*1/6
+    addsd       .L__real_1_by_120(%rip), %xmm3  #xmm3 = 1/120 + (r*1/720)
+    mulsd       %xmm2, %xmm1    #xmm1 = r*r
+    addsd       .L__real_1_by_2(%rip), %xmm0  #xmm0 = 1/2 + (r*1/6)
+    movapd       %xmm1, %xmm4   #xmm4 = r*r
+    mulsd       %xmm1, %xmm4    #xmm4 = (r*r) * (r*r)
+    mulsd       %xmm2, %xmm3    #xmm3 = r * (1/120 + (r*1/720))
+    mulsd       %xmm1, %xmm0    #xmm0 = (r*r)*(1/2 + (r*1/6))
+    addsd       .L__real_1_by_24(%rip), %xmm3  #xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
+    addsd       %xmm2, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
+    mulsd       %xmm4, %xmm3   #xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    addsd       %xmm3, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    # xmm0 = q here.  f = 2^(j/64) from the full table; f1 and f2 are its head and tail parts.
+    # (f)*(q) + f2 + f1
+    cmp         $0xfffffc02, %ecx # -1022   (flags are consumed by the jle below; lea/mulsd/addsd do not write EFLAGS)
+    lea         .L__two_to_jby64_table(%rip), %rdx
+    lea         .L__two_to_jby64_tail_table(%rip), %r11
+    lea         .L__two_to_jby64_head_table(%rip), %r10
+    mulsd       (%rdx,%rax,8), %xmm0                  # f * q
+    addsd       (%r11,%rax,8), %xmm0                  # + f2 (tail)
+    addsd       (%r10,%rax,8), %xmm0                  # + f1 (head)
+
+    jle         .L__process_denormal                  # m <= -1022: the scaled result may be denormal
+.L__process_normal:
+    shl         $52, %rcx                             # move m up to the exponent field position
+    movd        %rcx,%xmm2
+    paddq       %xmm2, %xmm0                          # integer add on the bit pattern: multiplies xmm0 by 2^m
+    ret
+
+.p2align 4
+.L__process_denormal:
+    jl          .L__process_true_denormal             # still the flags of the cmp against -1022: m < -1022
+    ucomisd     .L__real_one(%rip), %xmm0             # m == -1022: compare the unscaled value with 1.0
+    jae         .L__process_normal                    # value >= 1.0, so exponent arithmetic is taken
+.L__process_true_denormal:
+    # here ( e^r < 1 and m = -1022 ) or m <= -1023
+    add         $1074, %ecx                           # ecx = m + 1074 = bit index of 2^m in the double encoding
+    mov         $1, %rax
+    shl         %cl, %rax                             # rax = 1 << (m + 1074), the bit pattern of 2^m
+    movd         %rax, %xmm2
+    mulsd       %xmm2, %xmm0                          # scale with a real multiply so the hardware rounds the denormal
+    ret
+    
+.p2align 4
+.L__y_is_inf:
+    mov         $0x7ff0000000000000,%rax              # bit pattern of +infinity
+    movd       %rax, %xmm1                            # xmm1 = +infinity; xmm0 still holds x
+    mov         $3, %edi                              # code 3 handed to the special-case handler (meaning defined by that handler)
+    jmp         fname_special                         # tail call: the handler returns straight to our caller
+
+.p2align 4
+.L__y_is_nan:
+    movapd      %xmm0,%xmm1
+    addsd       %xmm0,%xmm1                           # xmm1 = x + x with x a NaN
+    mov         $1, %edi                              # code 1 handed to the special-case handler
+    jmp         fname_special
+
+.p2align 4
+.L__y_is_zero:
+    ucomisd     .L__min_exp_arg(%rip),%xmm0
+    jbe          .L__return_zero                      # x <= min_exp_arg
+    movapd       .L__real_smallest_denormal(%rip), %xmm0   # otherwise return the smallest denormal directly (movapd: aligned operand required)
+    ret
+
+.p2align 4
+.L__return_zero:
+    pxor        %xmm1,%xmm1                           # xmm1 = +0.0; xmm0 still holds x
+    mov         $2, %edi                              # code 2 handed to the special-case handler
+    jmp         fname_special
+    
+.data
+.align 16
+.L__max_exp_arg:            .quad 0x40862e42fefa39ef    # about 709.78; larger x takes the .L__y_is_inf path
+.L__denormal_tiny_threshold:  .quad 0xc0874046dfefd9d0  # about -744.03; x at or below this skips the polynomial
+.L__min_exp_arg:            .quad 0xc0874910d52d3051    # about -745.13; x at or below this takes the .L__return_zero path
+.L__real_64_by_log2:        .quad 0x40571547652b82fe    # 64/ln(2)
+# NOTE: .L__real_1_by_720, .L__real_1_by_6 and .L__real_smallest_denormal are read with movapd; keep them at 16-byte offsets from the .align below.
+.align 16
+.L__log2_by_64_mhead: .quad 0xbf862e42fefa0000                 # -(ln(2)/64) high part, low 16 mantissa bits zero
+.L__log2_by_64_mtail: .quad 0xbd1cf79abc9e3b39                 # -(ln(2)/64) low part
+.L__real_1_by_720:              .quad 0x3f56c16c16c16c17    # 1/720
+.L__real_1_by_120:              .quad 0x3f81111111111111    # 1/120
+.L__real_1_by_6:                .quad 0x3fc5555555555555    # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000    # 1/2
+.L__real_1_by_24:               .quad 0x3fa5555555555555    # 1/24
+.L__real_one:                   .quad 0x3ff0000000000000    # 1.0
+.L__real_smallest_denormal:     .quad 0x0000000000000001    # bit pattern of the smallest positive denormal
+# Three 64-entry tables follow, indexed by j = n & 0x3f: 2^(j/64) as a double, then its head
+# (same value with the low 28 mantissa bits cleared), then its tail (2^(j/64) minus the head).
+.align 16
+.L__two_to_jby64_table:
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+.align 16
+.L__two_to_jby64_head_table:
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a30000000
+    .quad 0x3ff059b0d0000000
+    .quad 0x3ff0874510000000
+    .quad 0x3ff0b55860000000
+    .quad 0x3ff0e3ec30000000
+    .quad 0x3ff11301d0000000
+    .quad 0x3ff1429aa0000000
+    .quad 0x3ff172b830000000
+    .quad 0x3ff1a35be0000000
+    .quad 0x3ff1d48730000000
+    .quad 0x3ff2063b80000000
+    .quad 0x3ff2387a60000000
+    .quad 0x3ff26b4560000000
+    .quad 0x3ff29e9df0000000
+    .quad 0x3ff2d285a0000000
+    .quad 0x3ff306fe00000000
+    .quad 0x3ff33c08b0000000
+    .quad 0x3ff371a730000000
+    .quad 0x3ff3a7db30000000
+    .quad 0x3ff3dea640000000
+    .quad 0x3ff4160a20000000
+    .quad 0x3ff44e0860000000
+    .quad 0x3ff486a2b0000000
+    .quad 0x3ff4bfdad0000000
+    .quad 0x3ff4f9b270000000
+    .quad 0x3ff5342b50000000
+    .quad 0x3ff56f4730000000
+    .quad 0x3ff5ab07d0000000
+    .quad 0x3ff5e76f10000000
+    .quad 0x3ff6247eb0000000
+    .quad 0x3ff6623880000000
+    .quad 0x3ff6a09e60000000
+    .quad 0x3ff6dfb230000000
+    .quad 0x3ff71f75e0000000
+    .quad 0x3ff75feb50000000
+    .quad 0x3ff7a11470000000
+    .quad 0x3ff7e2f330000000
+    .quad 0x3ff8258990000000
+    .quad 0x3ff868d990000000
+    .quad 0x3ff8ace540000000
+    .quad 0x3ff8f1ae90000000
+    .quad 0x3ff93737b0000000
+    .quad 0x3ff97d8290000000
+    .quad 0x3ff9c49180000000
+    .quad 0x3ffa0c6670000000
+    .quad 0x3ffa5503b0000000
+    .quad 0x3ffa9e6b50000000
+    .quad 0x3ffae89f90000000
+    .quad 0x3ffb33a2b0000000
+    .quad 0x3ffb7f76f0000000
+    .quad 0x3ffbcc1e90000000
+    .quad 0x3ffc199bd0000000
+    .quad 0x3ffc67f120000000
+    .quad 0x3ffcb720d0000000
+    .quad 0x3ffd072d40000000
+    .quad 0x3ffd5818d0000000
+    .quad 0x3ffda9e600000000
+    .quad 0x3ffdfc9730000000
+    .quad 0x3ffe502ee0000000
+    .quad 0x3ffea4afa0000000
+    .quad 0x3ffefa1be0000000
+    .quad 0x3fff507650000000
+    .quad 0x3fffa7c180000000
+
+.align 16
+.L__two_to_jby64_tail_table:
+    .quad 0x0000000000000000
+    .quad 0x3e6cef00c1dcdef9
+    .quad 0x3e48ac2ba1d73e2a
+    .quad 0x3e60eb37901186be
+    .quad 0x3e69f3121ec53172
+    .quad 0x3e469e8d10103a17
+    .quad 0x3df25b50a4ebbf1a
+    .quad 0x3e6d525bbf668203
+    .quad 0x3e68faa2f5b9bef9
+    .quad 0x3e66df96ea796d31
+    .quad 0x3e368b9aa7805b80
+    .quad 0x3e60c519ac771dd6
+    .quad 0x3e6ceac470cd83f5
+    .quad 0x3e5789f37495e99c
+    .quad 0x3e547f7b84b09745
+    .quad 0x3e5b900c2d002475
+    .quad 0x3e64636e2a5bd1ab
+    .quad 0x3e4320b7fa64e430
+    .quad 0x3e5ceaa72a9c5154
+    .quad 0x3e53967fdba86f24
+    .quad 0x3e682468446b6824
+    .quad 0x3e3f72e29f84325b
+    .quad 0x3e18624b40c4dbd0
+    .quad 0x3e5704f3404f068e
+    .quad 0x3e54d8a89c750e5e
+    .quad 0x3e5a74b29ab4cf62
+    .quad 0x3e5a753e077c2a0f
+    .quad 0x3e5ad49f699bb2c0
+    .quad 0x3e6a90a852b19260
+    .quad 0x3e56b48521ba6f93
+    .quad 0x3e0d2ac258f87d03
+    .quad 0x3e42a91124893ecf
+    .quad 0x3e59fcef32422cbe
+    .quad 0x3e68ca345de441c5
+    .quad 0x3e61d8bee7ba46e1
+    .quad 0x3e59099f22fdba6a
+    .quad 0x3e4f580c36bea881
+    .quad 0x3e5b3d398841740a
+    .quad 0x3e62999c25159f11
+    .quad 0x3e668925d901c83b
+    .quad 0x3e415506dadd3e2a
+    .quad 0x3e622aee6c57304e
+    .quad 0x3e29b8bc9e8a0387
+    .quad 0x3e6fbc9c9f173d24
+    .quad 0x3e451f8480e3e235
+    .quad 0x3e66bbcac96535b5
+    .quad 0x3e41f12ae45a1224
+    .quad 0x3e55e7f6fd0fac90
+    .quad 0x3e62b5a75abd0e69
+    .quad 0x3e609e2bf5ed7fa1
+    .quad 0x3e47daf237553d84
+    .quad 0x3e12f074891ee83d
+    .quad 0x3e6b0aa538444196
+    .quad 0x3e6cafa29694426f
+    .quad 0x3e69df20d22a0797
+    .quad 0x3e640f12f71a1e45
+    .quad 0x3e69f7490e4bb40b
+    .quad 0x3e4ed9942b84600d
+    .quad 0x3e4bdcdaf5cb4656
+    .quad 0x3e5e2cffd89cf44c
+    .quad 0x3e452486cc2c7b9d
+    .quad 0x3e6cc2b44eee3fa4
+    .quad 0x3e66dc8a80ce9f09
+    .quad 0x3e39e90d82e90a7e
+
+#endif
diff --git a/src/gas/exp10.S b/src/gas/exp10.S
new file mode 100644
index 0000000..009bbe0
--- /dev/null
+++ b/src/gas/exp10.S
@@ -0,0 +1,366 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(exp10)
+#define fname_special _exp10_special@PLT
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.p2align 4                          # 16-byte align the entry point
+.globl fname                        # double-precision 10^x: argument x arrives in xmm0
+.type fname,@function               # result is returned in xmm0; no stack is used
+fname:                              # scratch: rax, rcx, rdx, edi, r10, r11, xmm1-xmm4, flags
+    ucomisd      .L__max_exp10_arg(%rip), %xmm0
+    jae          .L__y_is_inf        # x >= max argument: result is +inf
+    jp           .L__y_is_nan        # unordered compare: x is NaN
+    ucomisd      .L__min_exp10_arg(%rip), %xmm0
+    jbe          .L__y_is_zero       # x <= min argument: result is +0.0
+
+    # x * (64/log10(2))
+    movapd      %xmm0,%xmm1        
+    mulsd       .L__real_64_by_log10of2(%rip), %xmm1
+
+    # n = int( x * (64/log10(2)) )
+    cvttpd2dq    %xmm1, %xmm2   #xmm2 = (int)n, truncated toward zero
+    cvtdq2pd    %xmm2, %xmm1   #xmm1 = (double)n
+    movd        %xmm2, %ecx    #ecx = n
+    movapd     %xmm1,%xmm2     #xmm2 = (double)n, kept for the tail product
+    # r1 = x - n * log10(2)/64 head (the constant is stored negated)
+    mulsd    .L__log10of2_by_64_mhead(%rip),%xmm1 #xmm1 = -n * head
+        
+    #j = n & 0x3f    
+    mov         $0x3f, %rax
+    and         %ecx, %eax     #eax = j (table index; upper half of rax stays zero)
+    # m = (n - j) / 64      
+    sar         $6, %ecx       #ecx = m (arithmetic shift keeps the sign of n)
+
+    # r2 = - n * log10(2)/64 tail
+    mulsd    .L__log10of2_by_64_mtail(%rip),%xmm2 #xmm2 = r2
+    addsd    %xmm1,%xmm0   #xmm0 = r1
+    
+    # r1 *= ln10;
+    # r2 *= ln10;
+    mulsd   .L__ln10(%rip),%xmm0
+    mulsd   .L__ln10(%rip),%xmm2
+
+    # r1+r2
+    addsd       %xmm0, %xmm2 #xmm2 = r
+
+    # q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
+    # q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
+    movapd       .L__real_1_by_720(%rip), %xmm3  #xmm3 = 1/720 (aligned 16-byte load, only the low lane is used)
+    mulsd       %xmm2, %xmm3    #xmm3 = r*1/720
+    movapd       .L__real_1_by_6(%rip), %xmm0    #xmm0 = 1/6 (aligned 16-byte load, only the low lane is used)
+    movapd      %xmm2, %xmm1 #xmm1 = r            
+    mulsd       %xmm2, %xmm0    #xmm0 = r*1/6
+    addsd       .L__real_1_by_120(%rip), %xmm3  #xmm3 = 1/120 + (r*1/720)
+    mulsd       %xmm2, %xmm1    #xmm1 = r*r    
+    addsd       .L__real_1_by_2(%rip), %xmm0  #xmm0 = 1/2 + (r*1/6)        
+    movapd       %xmm1, %xmm4   #xmm4 = r*r
+    mulsd       %xmm1, %xmm4    #xmm4 = (r*r) * (r*r)    
+    mulsd       %xmm2, %xmm3    #xmm3 = r * (1/120 + (r*1/720))
+    mulsd       %xmm1, %xmm0    #xmm0 = (r*r)*(1/2 + (r*1/6))
+    addsd       .L__real_1_by_24(%rip), %xmm3  #xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
+    addsd       %xmm2, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
+    mulsd       %xmm4, %xmm3   #xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    addsd       %xmm3, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    
+    # (f)*(q) + f2 + f1
+    cmp         $0xfffffc02, %ecx # -1022; these flags are consumed by the jle below
+    lea         .L__two_to_jby64_table(%rip), %rdx        
+    lea         .L__two_to_jby64_tail_table(%rip), %r11       
+    lea         .L__two_to_jby64_head_table(%rip), %r10      
+    mulsd       (%rdx,%rax,8), %xmm0   #xmm0 = f*q
+    addsd       (%r11,%rax,8), %xmm0   #xmm0 = f*q + f2 (tail part of f)
+    addsd       (%r10,%rax,8), %xmm0   #xmm0 = f*q + f2 + f1 (head part of f)
+
+    jle         .L__process_denormal   # m <= -1022; lea and the SSE arithmetic above leave EFLAGS untouched
+.L__process_normal:
+    shl         $52, %rcx      # move m up to the exponent field position
+    movd        %rcx,%xmm2
+    paddq       %xmm2, %xmm0   # integer add into the exponent field: result *= 2^m
+    ret
+
+.p2align 4
+.L__process_denormal:
+    jl          .L__process_true_denormal   # m < -1022 (still the flags from the cmp above)
+    ucomisd     .L__real_one(%rip), %xmm0
+    jae         .L__process_normal          # m == -1022 and value >= 1.0: result is still normal
+.L__process_true_denormal:
+    # here ( e^r < 1 and m = -1022 ) or m <= -1023
+    add         $1074, %ecx    # cl = m + 1074 = bit index of 2^m in a double bit pattern
+    mov         $1, %rax    
+    shl         %cl, %rax      # rax = bit pattern of 2^m
+    movd         %rax, %xmm2
+    mulsd       %xmm2, %xmm0   # real multiply by 2^m, so rounding into the subnormal range is done by the FPU
+    ret        
+    
+.p2align 4
+.L__y_is_inf:
+    mov         $0x7ff0000000000000,%rax   # bit pattern of +infinity
+    movd       %rax, %xmm1
+    mov         $3, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa %xmm1,%xmm0 #remove this if call is made
+    ret     
+
+.p2align 4
+.L__y_is_nan:
+    movapd      %xmm0,%xmm1
+    addsd       %xmm0,%xmm1                # x + x: returns the NaN in quiet form
+    mov         $1, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa %xmm1,%xmm0 #remove this if call is made    
+    ret
+
+.p2align 4
+.L__y_is_zero:
+    pxor        %xmm1,%xmm1                # +0.0
+    mov         $2, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa %xmm1,%xmm0 #remove this if call is made    
+    ret      
+    
+.section .rodata                    # read-only constants: the routine above only ever loads them
+.align 16
+.L__max_exp10_arg:          .quad 0x40734413509f79ff    # 1024*log10(2), about 308.2547; at or above this the result is +inf
+.L__min_exp10_arg:            .quad 0xc07434e6420f4374  # -1074*log10(2), about -323.306; at or below this the result is 0
+.L__real_64_by_log10of2:    .quad 0x406A934F0979A371    # 64/log10(2)
+.L__ln10:                   .quad 0x40026BB1BBB55516    # ln(10)
+
+.align 16                           # keeps 1/720 and 1/6 below on 16-byte boundaries for the movapd loads
+.L__log10of2_by_64_mhead: .quad 0xbF73441350000000      # -(leading bits of log10(2)/64), stored negated
+.L__log10of2_by_64_mtail: .quad 0xbda3ef3fde623e25      # -(low-order remainder of log10(2)/64), stored negated
+.L__real_1_by_720:              .quad 0x3f56c16c16c16c17    # 1/720
+.L__real_1_by_120:              .quad 0x3f81111111111111    # 1/120
+.L__real_1_by_6:                .quad 0x3fc5555555555555    # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000    # 1/2
+.L__real_1_by_24:               .quad 0x3fa5555555555555    # 1/24
+.L__real_one:                   .quad 0x3ff0000000000000    # 1.0
+
+.align 16
+.L__two_to_jby64_table:             # 64 doubles: entry j holds 2^(j/64) (entry 0 = 1.0, entry 32 = sqrt(2))
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+.align 16
+.L__two_to_jby64_head_table:        # 64 doubles: 2^(j/64) with the low 28 mantissa bits cleared (head part f1)
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a30000000
+    .quad 0x3ff059b0d0000000
+    .quad 0x3ff0874510000000
+    .quad 0x3ff0b55860000000
+    .quad 0x3ff0e3ec30000000
+    .quad 0x3ff11301d0000000
+    .quad 0x3ff1429aa0000000
+    .quad 0x3ff172b830000000
+    .quad 0x3ff1a35be0000000
+    .quad 0x3ff1d48730000000
+    .quad 0x3ff2063b80000000
+    .quad 0x3ff2387a60000000
+    .quad 0x3ff26b4560000000
+    .quad 0x3ff29e9df0000000
+    .quad 0x3ff2d285a0000000
+    .quad 0x3ff306fe00000000
+    .quad 0x3ff33c08b0000000
+    .quad 0x3ff371a730000000
+    .quad 0x3ff3a7db30000000
+    .quad 0x3ff3dea640000000
+    .quad 0x3ff4160a20000000
+    .quad 0x3ff44e0860000000
+    .quad 0x3ff486a2b0000000
+    .quad 0x3ff4bfdad0000000
+    .quad 0x3ff4f9b270000000
+    .quad 0x3ff5342b50000000
+    .quad 0x3ff56f4730000000
+    .quad 0x3ff5ab07d0000000
+    .quad 0x3ff5e76f10000000
+    .quad 0x3ff6247eb0000000
+    .quad 0x3ff6623880000000
+    .quad 0x3ff6a09e60000000
+    .quad 0x3ff6dfb230000000
+    .quad 0x3ff71f75e0000000
+    .quad 0x3ff75feb50000000
+    .quad 0x3ff7a11470000000
+    .quad 0x3ff7e2f330000000
+    .quad 0x3ff8258990000000
+    .quad 0x3ff868d990000000
+    .quad 0x3ff8ace540000000
+    .quad 0x3ff8f1ae90000000
+    .quad 0x3ff93737b0000000
+    .quad 0x3ff97d8290000000
+    .quad 0x3ff9c49180000000
+    .quad 0x3ffa0c6670000000
+    .quad 0x3ffa5503b0000000
+    .quad 0x3ffa9e6b50000000
+    .quad 0x3ffae89f90000000
+    .quad 0x3ffb33a2b0000000
+    .quad 0x3ffb7f76f0000000
+    .quad 0x3ffbcc1e90000000
+    .quad 0x3ffc199bd0000000
+    .quad 0x3ffc67f120000000
+    .quad 0x3ffcb720d0000000
+    .quad 0x3ffd072d40000000
+    .quad 0x3ffd5818d0000000
+    .quad 0x3ffda9e600000000
+    .quad 0x3ffdfc9730000000
+    .quad 0x3ffe502ee0000000
+    .quad 0x3ffea4afa0000000
+    .quad 0x3ffefa1be0000000
+    .quad 0x3fff507650000000
+    .quad 0x3fffa7c180000000
+
+.align 16
+.L__two_to_jby64_tail_table:        # 64 doubles: low-order part f2; head[j] + tail[j] approximates 2^(j/64)
+    .quad 0x0000000000000000
+    .quad 0x3e6cef00c1dcdef9
+    .quad 0x3e48ac2ba1d73e2a
+    .quad 0x3e60eb37901186be
+    .quad 0x3e69f3121ec53172
+    .quad 0x3e469e8d10103a17
+    .quad 0x3df25b50a4ebbf1a
+    .quad 0x3e6d525bbf668203
+    .quad 0x3e68faa2f5b9bef9
+    .quad 0x3e66df96ea796d31
+    .quad 0x3e368b9aa7805b80
+    .quad 0x3e60c519ac771dd6
+    .quad 0x3e6ceac470cd83f5
+    .quad 0x3e5789f37495e99c
+    .quad 0x3e547f7b84b09745
+    .quad 0x3e5b900c2d002475
+    .quad 0x3e64636e2a5bd1ab
+    .quad 0x3e4320b7fa64e430
+    .quad 0x3e5ceaa72a9c5154
+    .quad 0x3e53967fdba86f24
+    .quad 0x3e682468446b6824
+    .quad 0x3e3f72e29f84325b
+    .quad 0x3e18624b40c4dbd0
+    .quad 0x3e5704f3404f068e
+    .quad 0x3e54d8a89c750e5e
+    .quad 0x3e5a74b29ab4cf62
+    .quad 0x3e5a753e077c2a0f
+    .quad 0x3e5ad49f699bb2c0
+    .quad 0x3e6a90a852b19260
+    .quad 0x3e56b48521ba6f93
+    .quad 0x3e0d2ac258f87d03
+    .quad 0x3e42a91124893ecf
+    .quad 0x3e59fcef32422cbe
+    .quad 0x3e68ca345de441c5
+    .quad 0x3e61d8bee7ba46e1
+    .quad 0x3e59099f22fdba6a
+    .quad 0x3e4f580c36bea881
+    .quad 0x3e5b3d398841740a
+    .quad 0x3e62999c25159f11
+    .quad 0x3e668925d901c83b
+    .quad 0x3e415506dadd3e2a
+    .quad 0x3e622aee6c57304e
+    .quad 0x3e29b8bc9e8a0387
+    .quad 0x3e6fbc9c9f173d24
+    .quad 0x3e451f8480e3e235
+    .quad 0x3e66bbcac96535b5
+    .quad 0x3e41f12ae45a1224
+    .quad 0x3e55e7f6fd0fac90
+    .quad 0x3e62b5a75abd0e69
+    .quad 0x3e609e2bf5ed7fa1
+    .quad 0x3e47daf237553d84
+    .quad 0x3e12f074891ee83d
+    .quad 0x3e6b0aa538444196
+    .quad 0x3e6cafa29694426f
+    .quad 0x3e69df20d22a0797
+    .quad 0x3e640f12f71a1e45
+    .quad 0x3e69f7490e4bb40b
+    .quad 0x3e4ed9942b84600d
+    .quad 0x3e4bdcdaf5cb4656
+    .quad 0x3e5e2cffd89cf44c
+    .quad 0x3e452486cc2c7b9d
+    .quad 0x3e6cc2b44eee3fa4
+    .quad 0x3e66dc8a80ce9f09
+    .quad 0x3e39e90d82e90a7e
+
+
+
diff --git a/src/gas/exp10f.S b/src/gas/exp10f.S
new file mode 100644
index 0000000..da805e2
--- /dev/null
+++ b/src/gas/exp10f.S
@@ -0,0 +1,191 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(exp10f)
+#define fname_special _exp10f_special@PLT
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.p2align 4                      # 16-byte align the entry point
+.globl fname                    # single-precision 10^x: argument x arrives in xmm0
+.type fname,@function           # result is returned in xmm0; no stack is used
+fname:                          # scratch: rax, ecx, edx, edi, r10, xmm1-xmm4, flags
+    ucomiss .L__max_exp_arg(%rip), %xmm0
+    ja .L__y_is_inf              # x > max argument: result is +inf
+    jp .L__y_is_nan              # unordered compare: x is NaN
+    ucomiss .L__min_exp_arg(%rip), %xmm0
+    jb .L__y_is_zero             # x < min argument: result is +0.0
+
+    cvtps2pd     %xmm0, %xmm0    #xmm0 = (double)x; the arithmetic below is done in double
+
+    # x * (64/log10of(2))
+    movapd      %xmm0,%xmm3      #xmm3 = (double)x
+    mulsd       .L__real_64_by_log10of2(%rip), %xmm3  #xmm3 = x * (64/log10(2))
+
+    # n = int( x * (64/log10of(2)) )
+    cvtpd2dq    %xmm3, %xmm4  #xmm4 = (int)n, rounded per the current MXCSR mode (not truncated)
+    cvtdq2pd    %xmm4, %xmm2  #xmm2 = (double)n
+
+    # r = x - n * log10(2)/64
+    # r *= ln(10)
+    mulsd       .L__real_log10of2_by_64(%rip),%xmm2 #xmm2 = n * log10of(2)/64
+    movd        %xmm4, %ecx     #ecx = n
+    subsd       %xmm2, %xmm0    #xmm0 = r
+    mulsd       .L__real_ln10(%rip),%xmm0 #xmm0 = r = r*ln10
+    movapd      %xmm0, %xmm1    #xmm1 = r
+
+    # q = r + r*r(1/2 + r*1/6)
+    movapd       .L__real_1_by_6(%rip), %xmm3 #xmm3 = 1/6; aligned 16-byte load, so the constant must stay on a 16-byte boundary
+    mulsd       %xmm0, %xmm3 #xmm3 = 1/6 * r
+    mulsd       %xmm1, %xmm0 #xmm0 =  r  * r
+    addsd       .L__real_1_by_2(%rip), %xmm3 #xmm3 = 1/2 + (1/6 * r)
+    mulsd       %xmm3, %xmm0  #xmm0 = r*r*(1/2 + (1/6 * r))
+    addsd       %xmm1, %xmm0  #xmm0 = r+r*r*(1/2 + (1/6 * r))
+    
+    #j = n & 0x3f
+    mov         $0x3f, %rax     #rax = 0x3f
+    and         %ecx, %eax      #eax = j = n & 0x3f
+
+    # f + (f*q)
+    lea         L__two_to_jby64_table(%rip), %r10    
+    mulsd       (%r10,%rax,8), %xmm0   #xmm0 = f*q, with f = table[j]
+    addsd       (%r10,%rax,8), %xmm0   #xmm0 = f*q + f
+
+    .p2align 4                  # pads this fall-through path with no-ops up to a 16-byte boundary
+    # m = (n - j) / 64        
+    psrad       $6,%xmm4        # arithmetic shift of each 32-bit lane: lane 0 = m
+    psllq       $52,%xmm4       # low bits of m moved up to the exponent field of the low 64-bit lane
+    paddq       %xmm0, %xmm4    # integer add into the exponent field: result *= 2^m
+    cvtpd2ps    %xmm4, %xmm0    # narrow back to single precision for the return value
+    ret
+
+.p2align 4
+.L__y_is_zero:
+    pxor        %xmm1, %xmm1    #return value in xmm1,input in xmm0 before calling
+    mov         $2, %edi        #code in edi
+    #call        fname_special
+    pxor        %xmm0,%xmm0#remove this if calling fname special
+    ret         
+
+.p2align 4
+.L__y_is_inf:
+    mov         $0x7f800000,%edx   # bit pattern of single-precision +infinity
+    movd        %edx, %xmm1
+    mov         $3, %edi           # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa     %xmm1,%xmm0#remove this if calling fname special
+    ret     
+
+.p2align 4
+.L__y_is_nan:
+    movaps %xmm0,%xmm1
+    addss  %xmm1,%xmm1             # x + x: returns the NaN in quiet form
+    mov         $1, %edi           # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa %xmm1,%xmm0  #remove this if calling fname special
+    ret       
+    
+.section .rodata                 # read-only constants: the routine above only ever loads them
+.align 16                        # also puts .L__real_1_by_6 (offset 32) on the 16-byte boundary its movapd load needs
+.L__max_exp_arg:                 .long 0x421A209B         # float, about 38.53 (128*log10(2)); above this the result is +inf
+.L__min_exp_arg:                 .long 0xC23369F4         # float, about -44.85 (-149*log10(2)); below this the result is 0
+.L__real_64_by_log10of2:        .quad 0x406A934F0979A371 # 64/log10(2)
+.L__real_log10of2_by_64:        .quad 0x3F734413509F79FF # log10(2)/64
+.L__real_ln10:                  .quad 0x40026BB1BBB55516 # ln(10)
+.L__real_1_by_6:                .quad 0x3fc5555555555555 # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000 # 1/2
+
+.align 16
+.type	L__two_to_jby64_table, @object   # NOTE(review): no .L prefix here, unlike the sibling files, so this label is kept in the object symbol table
+.size	L__two_to_jby64_table, 512       # 64 entries * 8 bytes
+L__two_to_jby64_table:                   # entry j holds 2^(j/64) as a double (entry 0 = 1.0, entry 32 = sqrt(2))
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+
diff --git a/src/gas/exp2.S b/src/gas/exp2.S
new file mode 100644
index 0000000..8e556d4
--- /dev/null
+++ b/src/gas/exp2.S
@@ -0,0 +1,355 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(exp2)
+#define fname_special _exp2_special@PLT
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.p2align 4                          # 16-byte align the entry point
+.globl fname                        # double-precision 2^x: argument x arrives in xmm0
+.type fname,@function               # result is returned in xmm0; no stack is used
+fname:                              # scratch: rax, rcx, rdx, edi, r10, r11, xmm1-xmm4, flags
+    ucomisd      .L__max_exp2_arg(%rip), %xmm0
+    ja           .L__y_is_inf        # x > max argument: result is +inf
+    jp           .L__y_is_nan        # unordered compare: x is NaN
+    ucomisd      .L__min_exp2_arg(%rip), %xmm0
+    jbe          .L__y_is_zero       # x <= min argument: result is +0.0
+
+    # x * (64)
+    movapd      %xmm0,%xmm2        
+    mulsd       .L__real_64(%rip), %xmm2
+
+    # n = int( x * (64))
+    cvttpd2dq    %xmm2, %xmm1   #xmm1 = (int)n, truncated toward zero
+    cvtdq2pd    %xmm1, %xmm2   #xmm2 = (double)n
+    movd        %xmm1, %ecx    #ecx = n
+
+    # r = x - n * 1/64  
+    #r *= ln2;      
+    mulsd    .L__one_by_64(%rip),%xmm2   #xmm2 = -n/64 (the constant is stored negated)
+    addsd    %xmm0,%xmm2   #xmm2 = r    
+    mulsd    .L__ln_2(%rip),%xmm2    
+        
+    #j = n & 0x3f    
+    mov         $0x3f, %rax
+    and         %ecx, %eax     #eax = j (table index; upper half of rax stays zero)
+    # m = (n - j) / 64      
+    sar         $6, %ecx       #ecx = m (arithmetic shift keeps the sign of n)
+
+    # q = r + r^2*1/2 + r^3*1/6 + r^4 *1/24 + r^5*1/120 + r^6*1/720
+    # q = r + r*r*(1/2 + r*(1/6+ r*(1/24 + r*(1/120 + r*(1/720)))))
+    movapd       .L__real_1_by_720(%rip), %xmm3  #xmm3 = 1/720 (aligned 16-byte load, only the low lane is used)
+    mulsd       %xmm2, %xmm3    #xmm3 = r*1/720
+    movapd       .L__real_1_by_6(%rip), %xmm0    #xmm0 = 1/6 (aligned 16-byte load, only the low lane is used)
+    movapd      %xmm2, %xmm1 #xmm1 = r            
+    mulsd       %xmm2, %xmm0    #xmm0 = r*1/6
+    addsd       .L__real_1_by_120(%rip), %xmm3  #xmm3 = 1/120 + (r*1/720)
+    mulsd       %xmm2, %xmm1    #xmm1 = r*r    
+    addsd       .L__real_1_by_2(%rip), %xmm0  #xmm0 = 1/2 + (r*1/6)        
+    movapd       %xmm1, %xmm4   #xmm4 = r*r
+    mulsd       %xmm1, %xmm4    #xmm4 = (r*r) * (r*r)    
+    mulsd       %xmm2, %xmm3    #xmm3 = r * (1/120 + (r*1/720))
+    mulsd       %xmm1, %xmm0    #xmm0 = (r*r)*(1/2 + (r*1/6))
+    addsd       .L__real_1_by_24(%rip), %xmm3  #xmm3 = 1/24 + (r * (1/120 + (r*1/720)))
+    addsd       %xmm2, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6)))
+    mulsd       %xmm4, %xmm3   #xmm3 = ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    addsd       %xmm3, %xmm0   #xmm0 = r + ((r*r)*(1/2 + (r*1/6))) + ((r*r) * (r*r)) * (1/24 + (r * (1/120 + (r*1/720))))
+    
+    # (f)*(q) + f2 + f1
+    cmp         $0xfffffc02, %ecx # -1022; these flags are consumed by the jle below
+    lea         .L__two_to_jby64_table(%rip), %rdx        
+    lea         .L__two_to_jby64_tail_table(%rip), %r11       
+    lea         .L__two_to_jby64_head_table(%rip), %r10      
+    mulsd       (%rdx,%rax,8), %xmm0   #xmm0 = f*q
+    addsd       (%r11,%rax,8), %xmm0   #xmm0 = f*q + f2 (tail part of f)
+    addsd       (%r10,%rax,8), %xmm0   #xmm0 = f*q + f2 + f1 (head part of f)
+
+    jle         .L__process_denormal   # m <= -1022; lea and the SSE arithmetic above leave EFLAGS untouched
+.L__process_normal:
+    shl         $52, %rcx      # move m up to the exponent field position
+    movd        %rcx,%xmm2
+    paddq       %xmm2, %xmm0   # integer add into the exponent field: result *= 2^m
+    ret
+
+.p2align 4
+.L__process_denormal:
+    jl          .L__process_true_denormal   # m < -1022 (still the flags from the cmp above)
+    ucomisd     .L__real_one(%rip), %xmm0
+    jae         .L__process_normal          # m == -1022 and value >= 1.0: result is still normal
+.L__process_true_denormal:
+    # here ( e^r < 1 and m = -1022 ) or m <= -1023
+    add         $1074, %ecx    # cl = m + 1074 = bit index of 2^m in a double bit pattern
+    mov         $1, %rax    
+    shl         %cl, %rax      # rax = bit pattern of 2^m
+    movd         %rax, %xmm2
+    mulsd       %xmm2, %xmm0   # real multiply by 2^m, so rounding into the subnormal range is done by the FPU
+    ret        
+    
+.p2align 4
+.L__y_is_inf:
+    mov         $0x7ff0000000000000,%rax   # bit pattern of +infinity
+    movd       %rax, %xmm1
+    mov         $3, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa     %xmm1,%xmm0 #remove this if call is made    
+    ret     
+
+.p2align 4
+.L__y_is_nan:
+    movapd      %xmm0,%xmm1
+    addsd       %xmm0,%xmm1                # x + x: returns the NaN in quiet form
+    mov         $1, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa     %xmm1,%xmm0 #remove this if call is made    
+    ret
+
+.p2align 4
+.L__y_is_zero:
+    pxor        %xmm1,%xmm1                # +0.0
+    mov         $2, %edi                   # code for the disabled special-case call below; unused otherwise
+    #call        fname_special
+    movdqa     %xmm1,%xmm0 #remove this if call is made
+    ret      
+    
+.section .rodata                    # read-only constants: the routine above only ever loads them
+.align 16
+.L__max_exp2_arg:            .quad 0x4090000000000000    # 1024.0; above this the result is +inf
+.L__min_exp2_arg:            .quad 0xc090c80000000000    # -1074.0; at or below this the result is 0
+.L__real_64:                 .quad 0x4050000000000000    # 64
+.L__ln_2:                    .quad 0x3FE62E42FEFA39EF    # ln(2)
+.L__one_by_64:               .quad 0xbF90000000000000    # -1/64 (stored negated so r is formed with an add)
+
+.align 16                           # keeps 1/720 and 1/6 below on 16-byte boundaries for the movapd loads
+.L__real_1_by_720:              .quad 0x3f56c16c16c16c17    # 1/720
+.L__real_1_by_120:              .quad 0x3f81111111111111    # 1/120
+.L__real_1_by_6:                .quad 0x3fc5555555555555    # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000    # 1/2
+.L__real_1_by_24:               .quad 0x3fa5555555555555    # 1/24
+.L__real_one:                   .quad 0x3ff0000000000000    # 1.0
+
+.align 16
+.L__two_to_jby64_table:             # 64 doubles: entry j holds 2^(j/64) (entry 0 = 1.0, entry 32 = sqrt(2))
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+.align 16
+.L__two_to_jby64_head_table:        # 64 doubles: 2^(j/64) with the low 28 mantissa bits cleared (head part f1)
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a30000000
+    .quad 0x3ff059b0d0000000
+    .quad 0x3ff0874510000000
+    .quad 0x3ff0b55860000000
+    .quad 0x3ff0e3ec30000000
+    .quad 0x3ff11301d0000000
+    .quad 0x3ff1429aa0000000
+    .quad 0x3ff172b830000000
+    .quad 0x3ff1a35be0000000
+    .quad 0x3ff1d48730000000
+    .quad 0x3ff2063b80000000
+    .quad 0x3ff2387a60000000
+    .quad 0x3ff26b4560000000
+    .quad 0x3ff29e9df0000000
+    .quad 0x3ff2d285a0000000
+    .quad 0x3ff306fe00000000
+    .quad 0x3ff33c08b0000000
+    .quad 0x3ff371a730000000
+    .quad 0x3ff3a7db30000000
+    .quad 0x3ff3dea640000000
+    .quad 0x3ff4160a20000000
+    .quad 0x3ff44e0860000000
+    .quad 0x3ff486a2b0000000
+    .quad 0x3ff4bfdad0000000
+    .quad 0x3ff4f9b270000000
+    .quad 0x3ff5342b50000000
+    .quad 0x3ff56f4730000000
+    .quad 0x3ff5ab07d0000000
+    .quad 0x3ff5e76f10000000
+    .quad 0x3ff6247eb0000000
+    .quad 0x3ff6623880000000
+    .quad 0x3ff6a09e60000000
+    .quad 0x3ff6dfb230000000
+    .quad 0x3ff71f75e0000000
+    .quad 0x3ff75feb50000000
+    .quad 0x3ff7a11470000000
+    .quad 0x3ff7e2f330000000
+    .quad 0x3ff8258990000000
+    .quad 0x3ff868d990000000
+    .quad 0x3ff8ace540000000
+    .quad 0x3ff8f1ae90000000
+    .quad 0x3ff93737b0000000
+    .quad 0x3ff97d8290000000
+    .quad 0x3ff9c49180000000
+    .quad 0x3ffa0c6670000000
+    .quad 0x3ffa5503b0000000
+    .quad 0x3ffa9e6b50000000
+    .quad 0x3ffae89f90000000
+    .quad 0x3ffb33a2b0000000
+    .quad 0x3ffb7f76f0000000
+    .quad 0x3ffbcc1e90000000
+    .quad 0x3ffc199bd0000000
+    .quad 0x3ffc67f120000000
+    .quad 0x3ffcb720d0000000
+    .quad 0x3ffd072d40000000
+    .quad 0x3ffd5818d0000000
+    .quad 0x3ffda9e600000000
+    .quad 0x3ffdfc9730000000
+    .quad 0x3ffe502ee0000000
+    .quad 0x3ffea4afa0000000
+    .quad 0x3ffefa1be0000000
+    .quad 0x3fff507650000000
+    .quad 0x3fffa7c180000000
+
+.align 16
+.L__two_to_jby64_tail_table:        # 64 doubles: low-order part f2; head[j] + tail[j] approximates 2^(j/64)
+    .quad 0x0000000000000000
+    .quad 0x3e6cef00c1dcdef9
+    .quad 0x3e48ac2ba1d73e2a
+    .quad 0x3e60eb37901186be
+    .quad 0x3e69f3121ec53172
+    .quad 0x3e469e8d10103a17
+    .quad 0x3df25b50a4ebbf1a
+    .quad 0x3e6d525bbf668203
+    .quad 0x3e68faa2f5b9bef9
+    .quad 0x3e66df96ea796d31
+    .quad 0x3e368b9aa7805b80
+    .quad 0x3e60c519ac771dd6
+    .quad 0x3e6ceac470cd83f5
+    .quad 0x3e5789f37495e99c
+    .quad 0x3e547f7b84b09745
+    .quad 0x3e5b900c2d002475
+    .quad 0x3e64636e2a5bd1ab
+    .quad 0x3e4320b7fa64e430
+    .quad 0x3e5ceaa72a9c5154
+    .quad 0x3e53967fdba86f24
+    .quad 0x3e682468446b6824
+    .quad 0x3e3f72e29f84325b
+    .quad 0x3e18624b40c4dbd0
+    .quad 0x3e5704f3404f068e
+    .quad 0x3e54d8a89c750e5e
+    .quad 0x3e5a74b29ab4cf62
+    .quad 0x3e5a753e077c2a0f
+    .quad 0x3e5ad49f699bb2c0
+    .quad 0x3e6a90a852b19260
+    .quad 0x3e56b48521ba6f93
+    .quad 0x3e0d2ac258f87d03
+    .quad 0x3e42a91124893ecf
+    .quad 0x3e59fcef32422cbe
+    .quad 0x3e68ca345de441c5
+    .quad 0x3e61d8bee7ba46e1
+    .quad 0x3e59099f22fdba6a
+    .quad 0x3e4f580c36bea881
+    .quad 0x3e5b3d398841740a
+    .quad 0x3e62999c25159f11
+    .quad 0x3e668925d901c83b
+    .quad 0x3e415506dadd3e2a
+    .quad 0x3e622aee6c57304e
+    .quad 0x3e29b8bc9e8a0387
+    .quad 0x3e6fbc9c9f173d24
+    .quad 0x3e451f8480e3e235
+    .quad 0x3e66bbcac96535b5
+    .quad 0x3e41f12ae45a1224
+    .quad 0x3e55e7f6fd0fac90
+    .quad 0x3e62b5a75abd0e69
+    .quad 0x3e609e2bf5ed7fa1
+    .quad 0x3e47daf237553d84
+    .quad 0x3e12f074891ee83d
+    .quad 0x3e6b0aa538444196
+    .quad 0x3e6cafa29694426f
+    .quad 0x3e69df20d22a0797
+    .quad 0x3e640f12f71a1e45
+    .quad 0x3e69f7490e4bb40b
+    .quad 0x3e4ed9942b84600d
+    .quad 0x3e4bdcdaf5cb4656
+    .quad 0x3e5e2cffd89cf44c
+    .quad 0x3e452486cc2c7b9d
+    .quad 0x3e6cc2b44eee3fa4
+    .quad 0x3e66dc8a80ce9f09
+    .quad 0x3e39e90d82e90a7e
+
+
diff --git a/src/gas/exp2f.S b/src/gas/exp2f.S
new file mode 100644
index 0000000..78c50e0
--- /dev/null
+++ b/src/gas/exp2f.S
@@ -0,0 +1,193 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(exp2f)
+#define fname_special _exp2f_special@PLT
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+.text
+.p2align 4
+.globl fname
+.type fname,@function
+fname:                                       # float exp2f(float x): x is read from xmm0, result is returned in xmm0
+    ucomiss .L__max_exp2_arg(%rip), %xmm0    # compare x with 128.0f
+    ja .L__y_is_inf                          # x > 128: return +inf
+    jp .L__y_is_nan                          # unordered compare: x is NaN
+    ucomiss .L__min_exp2_arg(%rip), %xmm0    # compare x with -149.0f
+    jb .L__y_is_zero                         # x < -149: return +0
+
+    cvtps2pd     %xmm0, %xmm0    #xmm0 = (double)x; the core computation below is done in double precision
+
+    # x * (64)
+    movapd      %xmm0,%xmm3      #xmm3 = (double)x
+    #mulsd       .L__sixtyfour(%rip), %xmm3  #xmm3 = x * (64)
+    paddq       .L__sixtyfour(%rip), %xmm3  #xmm3 = x * (64): integer add of 6<<52 raises the exponent field by 6 (x = 0 becomes a tiny value that still converts to n = 0)
+
+    # n = int( x * (64) )
+    cvtpd2dq    %xmm3, %xmm4  #xmm4 = (int)n, rounded according to the current MXCSR rounding mode
+    cvtdq2pd    %xmm4, %xmm2  #xmm2 = (double)n
+
+    # r = x - n * 1/64
+    # r *= ln(2)
+    mulsd       .L__one_by_64(%rip),%xmm2 #xmm2 = n * 1/64
+    movd        %xmm4, %ecx     #ecx = n
+    subsd       %xmm2, %xmm0    #xmm0 = r
+    mulsd       .L__ln2(%rip),%xmm0 #xmm0 = r = r*ln(2), so that 2^(x - n/64) = e^r
+    movapd      %xmm0, %xmm1    #xmm1 = r
+
+    # q = r + r*r*(1/2 + r/6), a cubic polynomial approximation of e^r - 1
+    movsd       .L__real_1_by_6(%rip), %xmm3 
+    mulsd       %xmm0, %xmm3 #xmm3 = 1/6 * r
+    mulsd       %xmm1, %xmm0 #xmm0 =  r  * r
+    addsd       .L__real_1_by_2(%rip), %xmm3 #xmm3 = 1/2 + (1/6 * r)
+    mulsd       %xmm3, %xmm0  #xmm0 = r*r*(1/2 + (1/6 * r))
+    addsd       %xmm1, %xmm0  #xmm0 = r+r*r*(1/2 + (1/6 * r))
+    
+    #j = n & 0x3f
+    mov         $0x3f, %rax     #rax = 0x3f
+    and         %ecx, %eax      #eax = j = n & 0x3f (32-bit write also clears the upper half of rax)
+
+    # f + (f*q), where f = table[j] = 2^(j/64)
+    lea         L__two_to_jby64_table(%rip), %r10    
+    mulsd       (%r10,%rax,8), %xmm0    #xmm0 = f*q
+    addsd       (%r10,%rax,8), %xmm0    #xmm0 = f + f*q
+
+    .p2align 4                   # NOTE(review): alignment padding inside straight-line code; no label follows it
+    # m = (n - j) / 64        
+    psrad       $6,%xmm4         #low dword of xmm4 = m = n >> 6 (arithmetic shift)
+    psllq       $52,%xmm4        #move the low bits of m up to the exponent field of a double
+    paddq       %xmm0, %xmm4     #add m to the exponent of f*(1+q), i.e. multiply by 2^m
+    cvtpd2ps    %xmm4, %xmm0     #convert the double result back to float
+    ret
+
+.p2align 4
+.L__y_is_zero:                  # reached when x < -149.0f: return +0.0f
+    pxor        %xmm1, %xmm1    #return value in xmm1,input in xmm0 before calling
+    mov         $2, %edi        #code in edi
+    #call        fname_special  (disabled, so the xmm1 and edi values set above are currently unused)
+    pxor        %xmm0,%xmm0#remove this if calling fname special
+    ret         
+
+.p2align 4
+.L__y_is_inf:                   # reached when x > 128.0f: return +inf
+    mov         $0x7f800000,%edx    # bit pattern of single-precision +infinity
+    movd        %edx, %xmm1         # xmm1 = +inf (would be the return-value argument of the special handler)
+    mov         $3, %edi            # code in edi (unused while the call below is disabled)
+    #call        fname_special
+    movdqa     %xmm1,%xmm0#remove this if calling fname special
+    ret     
+
+.p2align 4
+.L__y_is_nan:                   # reached when the compare with x was unordered (x is NaN)
+    movaps %xmm0,%xmm1          # xmm1 = x
+    addss  %xmm1,%xmm1          # xmm1 = x + x: the result is a quiet NaN
+    mov         $1, %edi        # code in edi (unused while the call below is disabled)
+    #call        fname_special
+    movdqa %xmm1,%xmm0  #remove this if calling fname special
+    ret      
+    
+.section .rodata                                  # constants are only ever read, so keep them out of writable .data
+.align 16
+.L__max_exp2_arg:                 .long 0x43000000 # 128.0f: larger inputs return +inf
+.L__min_exp2_arg:                 .long 0xc3150000 # -149.0f: smaller inputs return +0
+.align 16                                         # paddq reads 16 bytes at .L__sixtyfour and requires 16-byte alignment
+.L__sixtyfour:                  .quad 0x0060000000000000 # 6 << 52: integer-added to the bits of a double to scale it by 64
+.L__one_by_64:                  .quad 0x3F90000000000000 # 1/64
+.L__ln2:                        .quad 0x3FE62E42FEFA39EF # ln(2)
+.L__real_1_by_6:                .quad 0x3fc5555555555555 # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000 # 1/2
+
+.align 16
+.type	L__two_to_jby64_table, @object
+.size	L__two_to_jby64_table, 512      # 64 entries of 8 bytes each
+L__two_to_jby64_table:                 # entry j (0..63) is 2^(j/64) as an IEEE-754 double; indexed with j = n & 0x3f
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+
diff --git a/src/gas/expf.S b/src/gas/expf.S
new file mode 100644
index 0000000..cefa608
--- /dev/null
+++ b/src/gas/expf.S
@@ -0,0 +1,201 @@
+#ifdef __ELF__
+.section .note.GNU-stack,"",@progbits
+#endif
+
+#ifdef __x86_64__
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#
+# expf.S
+#
+# An implementation of the expf libm function.
+#
+# Prototype:
+#
+#     float expf(float x);
+#
+
+#
+#   Algorithm:
+#       Similar to the one presented in exp.S
+#
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(expf)
+#define fname_special _expf_special@PLT
+
+.text
+.p2align 4
+.globl fname
+.type fname,@function
+fname:                                       # float expf(float x): x is read from xmm0, result is returned in xmm0
+    ucomiss .L__max_exp_arg(%rip), %xmm0     # compare x with the upper threshold (about 88.72)
+    ja .L__y_is_inf                          # x above it: overflow path
+    jp .L__y_is_nan                          # unordered compare: x is NaN
+    ucomiss .L__min_exp_arg(%rip), %xmm0     # compare x with the lower threshold (about -103.28)
+    jb .L__y_is_zero                         # x below it: underflow path
+
+    cvtps2pd     %xmm0, %xmm0    #xmm0 = (double)x; the core computation below is done in double precision
+
+    # x * (64/ln(2))
+    movapd      %xmm0,%xmm3      #xmm3 = (double)x
+    mulsd       .L__real_64_by_log2(%rip), %xmm3  #xmm3 = x * (64/ln(2))
+
+    # n = int( x * (64/ln(2)) )
+    cvtpd2dq    %xmm3, %xmm4  #xmm4 = (int)n, rounded according to the current MXCSR rounding mode
+    cvtdq2pd    %xmm4, %xmm2  #xmm2 = (double)n
+
+    # r = x - n * ln(2)/64
+    mulsd       .L__real_log2_by_64(%rip),%xmm2 #xmm2 = n * ln(2)/64
+    movd        %xmm4, %ecx     #ecx = n
+    subsd       %xmm2, %xmm0    #xmm0 = r
+    movapd      %xmm0, %xmm1    #xmm1 = r
+
+    # q = r + r*r*(1/2 + r/6), a cubic polynomial approximation of e^r - 1
+    movsd       .L__real_1_by_6(%rip), %xmm3 
+    mulsd       %xmm0, %xmm3 #xmm3 = 1/6 * r
+    mulsd       %xmm1, %xmm0 #xmm0 =  r  * r
+    addsd       .L__real_1_by_2(%rip), %xmm3 #xmm3 = 1/2 + (1/6 * r)
+    mulsd       %xmm3, %xmm0  #xmm0 = r*r*(1/2 + (1/6 * r))
+    addsd       %xmm1, %xmm0  #xmm0 = r+r*r*(1/2 + (1/6 * r))
+    
+    #j = n & 0x3f
+    mov         $0x3f, %rax     #rax = 0x3f
+    and         %ecx, %eax      #eax = j = n & 0x3f (32-bit write also clears the upper half of rax)
+    # m = (n - j) / 64    
+    sar         $6, %ecx        #ecx = m
+    shl         $52, %rcx       #rcx = m placed at the exponent field of a double (bits above bit 63 are dropped)
+
+    # (f)*(1+q), where f = table[j] = 2^(j/64)
+    lea         L__two_to_jby64_table(%rip), %r10    
+    movsd       (%r10,%rax,8), %xmm2    #xmm2 = f
+    mulsd       %xmm2, %xmm0            #xmm0 = f*q
+    addsd       %xmm2, %xmm0            #xmm0 = f + f*q
+
+    movd        %rcx, %xmm1     #xmm1 = m << 52
+    paddq       %xmm0, %xmm1    #add m to the exponent of f*(1+q), i.e. multiply by 2^m
+    cvtpd2ps    %xmm1, %xmm0    #convert the double result back to float
+    ret
+
+.p2align 4
+.L__y_is_zero:
+    # reached when x is below .L__min_exp_arg; tail-jumps to the special-case handler with a zero result
+    pxor        %xmm1, %xmm1    #return value in xmm1,input in xmm0 before calling
+    mov         $2, %edi        #code in edi
+    jmp         fname_special   #tail call through the PLT; presumably the handler returns the value passed in xmm1 - confirm against its definition
+
+.p2align 4
+.L__y_is_inf:
+    # reached when x is above .L__max_exp_arg; tail-jumps to the special-case handler with +inf as the result
+    mov         $0x7f800000,%edx    # bit pattern of single-precision +infinity
+    movd        %edx, %xmm1         # xmm1 = +inf (return-value argument); xmm0 still holds x
+    mov         $3, %edi            # code in edi
+    jmp         fname_special       # tail call through the PLT
+
+.p2align 4
+.L__y_is_nan:                   # reached when the compare with x was unordered (x is NaN)
+    movaps %xmm0,%xmm1          # xmm1 = x
+    addss  %xmm1,%xmm1          # xmm1 = x + x: the result is a quiet NaN (return-value argument)
+    mov         $1, %edi        # code in edi
+    jmp         fname_special   # tail call through the PLT; xmm0 still holds x
+    
+.section .rodata                                 # constants are only ever read, so keep them out of writable .data
+.align 16
+.L__max_exp_arg:                 .long 0x42B17218 # about 88.72: larger inputs take the overflow path
+.L__min_exp_arg:                 .long 0xC2CE8ED0 # about -103.28: smaller inputs take the underflow path
+.L__real_64_by_log2:            .quad 0x40571547652b82fe # 64/ln(2)
+.L__real_log2_by_64:            .quad 0x3f862e42fefa39ef # ln(2)/64
+.L__real_1_by_6:                .quad 0x3fc5555555555555 # 1/6
+.L__real_1_by_2:                .quad 0x3fe0000000000000 # 1/2
+
+.align 16
+.type	L__two_to_jby64_table, @object
+.size	L__two_to_jby64_table, 512      # 64 entries of 8 bytes each
+L__two_to_jby64_table:                 # entry j (0..63) is 2^(j/64) as an IEEE-754 double; indexed with j = n & 0x3f
+    .quad 0x3ff0000000000000
+    .quad 0x3ff02c9a3e778061
+    .quad 0x3ff059b0d3158574
+    .quad 0x3ff0874518759bc8
+    .quad 0x3ff0b5586cf9890f
+    .quad 0x3ff0e3ec32d3d1a2
+    .quad 0x3ff11301d0125b51
+    .quad 0x3ff1429aaea92de0
+    .quad 0x3ff172b83c7d517b
+    .quad 0x3ff1a35beb6fcb75
+    .quad 0x3ff1d4873168b9aa
+    .quad 0x3ff2063b88628cd6
+    .quad 0x3ff2387a6e756238
+    .quad 0x3ff26b4565e27cdd
+    .quad 0x3ff29e9df51fdee1
+    .quad 0x3ff2d285a6e4030b
+    .quad 0x3ff306fe0a31b715
+    .quad 0x3ff33c08b26416ff
+    .quad 0x3ff371a7373aa9cb
+    .quad 0x3ff3a7db34e59ff7
+    .quad 0x3ff3dea64c123422
+    .quad 0x3ff4160a21f72e2a
+    .quad 0x3ff44e086061892d
+    .quad 0x3ff486a2b5c13cd0
+    .quad 0x3ff4bfdad5362a27
+    .quad 0x3ff4f9b2769d2ca7
+    .quad 0x3ff5342b569d4f82
+    .quad 0x3ff56f4736b527da
+    .quad 0x3ff5ab07dd485429
+    .quad 0x3ff5e76f15ad2148
+    .quad 0x3ff6247eb03a5585
+    .quad 0x3ff6623882552225
+    .quad 0x3ff6a09e667f3bcd
+    .quad 0x3ff6dfb23c651a2f
+    .quad 0x3ff71f75e8ec5f74
+    .quad 0x3ff75feb564267c9
+    .quad 0x3ff7a11473eb0187
+    .quad 0x3ff7e2f336cf4e62
+    .quad 0x3ff82589994cce13
+    .quad 0x3ff868d99b4492ed
+    .quad 0x3ff8ace5422aa0db
+    .quad 0x3ff8f1ae99157736
+    .quad 0x3ff93737b0cdc5e5
+    .quad 0x3ff97d829fde4e50
+    .quad 0x3ff9c49182a3f090
+    .quad 0x3ffa0c667b5de565
+    .quad 0x3ffa5503b23e255d
+    .quad 0x3ffa9e6b5579fdbf
+    .quad 0x3ffae89f995ad3ad
+    .quad 0x3ffb33a2b84f15fb
+    .quad 0x3ffb7f76f2fb5e47
+    .quad 0x3ffbcc1e904bc1d2
+    .quad 0x3ffc199bdd85529c
+    .quad 0x3ffc67f12e57d14b
+    .quad 0x3ffcb720dcef9069
+    .quad 0x3ffd072d4a07897c
+    .quad 0x3ffd5818dcfba487
+    .quad 0x3ffda9e603db3285
+    .quad 0x3ffdfc97337b9b5f
+    .quad 0x3ffe502ee78b3ff6
+    .quad 0x3ffea4afa2a490da
+    .quad 0x3ffefa1bee615a27
+    .quad 0x3fff50765b6e4540
+    .quad 0x3fffa7c1819e90d8
+
+
+#endif
diff --git a/src/gas/expm1.S b/src/gas/expm1.S
new file mode 100644
index 0000000..dff043c
--- /dev/null
+++ b/src/gas/expm1.S
@@ -0,0 +1,359 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(expm1)
+
+#ifdef __ELF__
+    .section .note.GNU-stack,"",@progbits
+#endif
+
+	.text
+	.p2align 4
+.globl fname
+	.type	fname, @function
+    # double expm1(double x): computes e^x - 1. x is read from xmm0 and the result is returned in xmm0.
+fname:
+    # Dispatch on the range of x, then fall into the table-driven path (.L__Normal_Flow) for everything else.
+    ucomisd .L__max_expm1_arg(%rip),%xmm0  #check if(x > 709.8)
+    ja .L__Max_Arg
+    jp .L__Max_Arg                         #unordered compare: x is NaN, sorted out in .L__Max_Arg
+    ucomisd .L__min_expm1_arg(%rip),%xmm0  #if(x < -37.42994775023704)
+    jb .L__Min_Arg
+    ucomisd .L__log_OneMinus_OneByFour(%rip),%xmm0
+    jbe .L__Normal_Flow                    #x <= log(1 - 1/4): table-driven path
+    ucomisd .L__log_OnePlus_OneByFour(%rip),%xmm0
+    jb .L__Small_Arg                       #log(1 - 1/4) < x < log(1 + 1/4): polynomial path
+    
+    .p2align 4
+.L__Normal_Flow:
+    movapd %xmm0,%xmm1  #xmm1 = x
+    mulsd .L__thirtyTwo_by_ln2(%rip),%xmm1   #xmm1 = x*thirtyTwo_by_ln2
+    ucomisd .L__zero(%rip),%xmm1             #check if temp < 0.0
+    jae .L__Add_Point_Five
+    subsd .L__point_Five(%rip),%xmm1
+    jmp .L__next     
+.L__Add_Point_Five:
+    addsd .L__point_Five(%rip),%xmm1         #xmm1 = temp +/- 0.5
+.L__next:
+    cvttpd2dq %xmm1,%xmm2              #xmm2 = (int)n (truncation after the +/- 0.5 gives round-half-away-from-zero)
+    cvtdq2pd  %xmm2,%xmm1              #xmm1 = (double)n  
+    movapd %xmm2,%xmm3                 #xmm3 = (int)n
+    psrad $5,%xmm2                     #xmm2 = m
+    pslld $27,%xmm3                    #keep only the low 5 bits of n ...
+    psrld $27,%xmm3                    #xmm3 = j    
+    movd %xmm3,%edx                    #edx = j
+    movd %xmm2,%ecx                    #ecx = m
+    
+    movlhps %xmm1,%xmm1                #xmm1 = n,n
+    mulpd .L__Ln2By32_MinusTrailLead(%rip),%xmm1   #xmm1 = n*(-ln2/32 trail), n*(ln2/32 lead)
+    movapd %xmm0,%xmm2
+    subsd %xmm1,%xmm2                  #xmm2 = r1
+    psrldq $8,%xmm1                    #xmm1 = r2
+    movapd %xmm2,%xmm3                 #xmm3 = r1    
+    addsd %xmm1,%xmm3                  #xmm3 = r
+    #q = r*(r*(A1.f64 + r*(A2.f64 + r*(A3.f64 + r*(A4.f64 + r*(A5.f64))))));    
+    movapd %xmm3,%xmm4
+    mulsd .L__A5(%rip),%xmm4
+    addsd .L__A4(%rip),%xmm4
+    mulsd %xmm3,%xmm4
+    addsd .L__A3(%rip),%xmm4
+    mulsd %xmm3,%xmm4
+    addsd .L__A2(%rip),%xmm4
+    mulsd %xmm3,%xmm4
+    addsd .L__A1(%rip),%xmm4
+    mulsd %xmm3,%xmm4
+    mulsd %xmm4,%xmm3                #xmm3 = q
+    
+    shl $4,%edx                       #edx = j*16, byte offset of table entry j
+	lea  S_lead_and_trail_table(%rip),%rax
+    movdqa  (%rax,%rdx,1),%xmm5       #xmm5 = S_T,S_L
+    
+    #p = (r2+q) + r1;
+    addsd %xmm3,%xmm1
+    addsd %xmm1,%xmm2                #xmm2 = p
+    
+    #s = S_L.f64 + S_T.f64;    
+    movhlps %xmm5,%xmm4              #xmm4 = S_T
+    movapd %xmm4,%xmm3               #xmm3 = S_T
+    addsd %xmm5,%xmm3                #xmm3 = s
+    
+    cmp $52,%ecx        #check m > 52
+    jg .L__M_Above_52
+    cmp $-7,%ecx        #check if m < -7
+    jl .L__M_Below_Minus7
+    #(-8 < m) && (m < 53): result = twopm * ((S_L - twopmm) + (S_L*p + S_T*(1+p)))
+    movapd %xmm2,%xmm3               #xmm3 = p
+    addsd .L__One(%rip),%xmm3  #xmm3 = 1+p
+    mulsd %xmm4,%xmm3          #xmm3 = S_T.f64 *(1+p)
+    mulsd %xmm5,%xmm2                #xmm2 = S_L*p
+    addsd %xmm3,%xmm2 #xmm2 = (S_L.f64*p+ S_T.f64 *(1+p))
+    mov $1023,%edx
+    sub %ecx,%edx                    #edx = twopmm
+    shl $52,%rdx                     #rdx = bit pattern of the double 2^-m
+    movd %rdx,%xmm1            #xmm1 = twopmm
+    subsd %xmm1,%xmm5    #xmm5 = S_L.f64 - twopmm.f64
+    addsd %xmm5,%xmm2
+    shl $52,%rcx                     #rcx = m placed at the exponent field of a double
+    movd %rcx,%xmm0      #xmm0 = twopm
+    paddq %xmm2,%xmm0   #xmm0 = twopm *(xmm2)
+    ret   
+    
+    .p2align 4  
+.L__M_Above_52:     # m > 52. Register state set up by the normal-flow code: ecx = m, xmm2 = p, xmm3 = s, xmm4 = S_T, xmm5 = S_L
+    cmp $1024,%ecx #check if m = 1024
+    je .L__M_Equals_1024
+    #twopm.f64 * (S_L.f64 + (s*p+(S_T.f64 - twopmm.f64)));// 2^-m should not be calculated if m>105
+    mov $1023,%edx                   #NOTE(review): twopmm is built for every m reaching here, not only for m <= 105
+    sub %ecx,%edx                    #edx = twopmm
+    shl $52,%rdx                     #rdx = bit pattern of the double 2^-m
+    movd %rdx,%xmm1            #xmm1 = twopmm
+    subsd %xmm1,%xmm4  #xmm4 = S_T - twopmm
+    mulsd %xmm3,%xmm2  #xmm2 = s*p
+    addsd %xmm4,%xmm2  #xmm2 = s*p + (S_T - twopmm)
+    addsd %xmm5,%xmm2  #xmm2 = S_L + (s*p + (S_T - twopmm))
+    shl $52,%rcx       #rcx = m placed at the exponent field of a double
+    movd %rcx,%xmm0      #xmm0 = twopm
+    paddq %xmm2,%xmm0    #add m to the exponent of xmm2, i.e. multiply by 2^m
+    ret
+    
+    .p2align 4    
+.L__M_Below_Minus7:  # m < -7. Register state set up by the normal-flow code: ecx = m, xmm2 = p, xmm3 = s, xmm4 = S_T, xmm5 = S_L
+    #twopm.f64 * (S_L.f64 + (s*p + S_T.f64)) - 1;
+    mulsd %xmm3,%xmm2    #xmm2 = s*p
+    addsd %xmm4,%xmm2   #xmm2 = (s*p + S_T.f64)
+    addsd %xmm5,%xmm2   #xmm2 = (S_L.f64 + (s*p + S_T.f64))
+    shl $52,%rcx         #rcx = m placed at the exponent field of a double (bits above bit 63 are dropped)
+    movd %rcx,%xmm0      #xmm0 = twopm
+    paddq %xmm2,%xmm0   #xmm0 = twopm *(xmm2)
+    subsd .L__One(%rip),%xmm0    #xmm0 = e^x - 1
+    ret
+    
+    .p2align 4
+.L__M_Equals_1024:   # m == 1024. Register state set up by the normal-flow code: xmm2 = p, xmm3 = s, xmm4 = S_T, xmm5 = S_L
+    mov $0x4000000000000000,%rax #1024 at exponent
+    mulsd %xmm3,%xmm2 #xmm2 = s*p
+    addsd %xmm4,%xmm2 #xmm2 = (s*p) + S_T
+    addsd %xmm5,%xmm2 #xmm2 = S_L + ((s*p) + S_T)
+    movd %rax,%xmm1 #xmm1 = twopm
+    paddq %xmm2,%xmm1 #add 1024 to the exponent of xmm2, i.e. multiply by 2^1024
+    movd %xmm1,%rax
+    mov $0x7FF0000000000000,%rcx #exponent-field mask, also the bit pattern of +inf
+    and %rcx,%rax
+    cmp %rcx,%rax #check if we reached inf
+    je .L__return_Inf #rax == 0x7FF0000000000000 here, which is the value .L__return_Inf moves into xmm0
+    movapd %xmm1,%xmm0                   
+    ret
+    
+    .p2align 4
+.L__Small_Arg:      # log(1 - 1/4) < x < log(1 + 1/4): polynomial evaluation directly in x (xmm0 = x)
+    movapd %xmm0,%xmm1
+    psllq $1,%xmm1            #shift the sign bit out ...
+    psrlq $1,%xmm1            #xmm1 = abs(x)
+    ucomisd .L__Five_Pont_FiveEMinus17(%rip),%xmm1   #compare abs(x) with 2^-54 (about 5.5e-17)
+    jb .L__VeryTinyArg
+    mov $0x01E0000000000000,%rax #30 in exponents place
+    #u = (twop30.f64 * x + x) - twop30.f64 * x;    
+    movd %rax,%xmm1
+    paddq %xmm0,%xmm1  #xmm1 = twop30.f64 * x
+    movapd %xmm1,%xmm2
+    addsd %xmm0,%xmm2 #xmm2 = (twop30.f64 * x + x)
+    subsd %xmm1,%xmm2 #xmm2 = u (x with its low-order bits rounded away by the add and subtract)
+    movapd %xmm0,%xmm1
+    subsd %xmm2,%xmm1 #xmm1 = v = x-u
+    movapd %xmm2,%xmm3 #xmm3 = u
+    mulsd %xmm2,%xmm3 #xmm3 = u*u
+    mulsd .L__point_Five(%rip),%xmm3 #xmm3 = y = u*u*0.5
+    #z = v * (x + u) * 0.5;
+    movapd %xmm0,%xmm4
+    addsd %xmm2,%xmm4
+    mulsd %xmm1,%xmm4
+    mulsd .L__point_Five(%rip),%xmm4 #xmm4 = z   
+    
+    #q = x*x*x*(B1 + x*(B2 + x*(B3 + x*(B4 + x*(B5 + x*(B6 + x*(B7 + x*(B8 + x*(B9))))))))), Horner form using .L__B1 to .L__B9
+    movapd %xmm0,%xmm5
+    mulsd .L__B9(%rip),%xmm5
+    addsd .L__B8(%rip),%xmm5
+    mulsd %xmm0,%xmm5
+    addsd .L__B7(%rip),%xmm5
+    mulsd %xmm0,%xmm5
+    addsd .L__B6(%rip),%xmm5
+    mulsd %xmm0,%xmm5           
+    addsd .L__B5(%rip),%xmm5
+    mulsd %xmm0,%xmm5    
+    addsd .L__B4(%rip),%xmm5
+    mulsd %xmm0,%xmm5    
+    addsd .L__B3(%rip),%xmm5
+    mulsd %xmm0,%xmm5    
+    addsd .L__B2(%rip),%xmm5
+    mulsd %xmm0,%xmm5   
+    addsd .L__B1(%rip),%xmm5
+    mulsd %xmm0,%xmm5  
+    mulsd %xmm0,%xmm5  
+    mulsd %xmm0,%xmm5   #xmm5 = q
+    
+    ucomisd .L__TwopM7(%rip),%xmm3    #compare y with 2^-7
+    jb .L__returnNext                 #y < 2^-7: use the summation order in .L__returnNext
+    addsd %xmm4,%xmm1  #xmm1 = v+z
+    addsd %xmm5,%xmm1  #xmm1 = q+(v+z)
+    addsd %xmm3,%xmm2  #xmm2 = u+y
+    addsd %xmm2,%xmm1  #xmm1 = (u+y) + (q+(v+z))
+    movapd %xmm1,%xmm0
+    ret    
+    .p2align 4
+.L__returnNext:
+    addsd %xmm5,%xmm4  #xmm4 = q +z
+    addsd %xmm4,%xmm3  #xmm3 = y+(q+z)
+    addsd %xmm3,%xmm0  #xmm0 = x + (y+(q+z))
+    ret
+    
+    .p2align 4  
+.L__VeryTinyArg:    # abs(x) < 2^-54; on entry xmm0 = x and xmm1 = abs(x)
+    #(twop100.f64 * x + xabs.f64) * twopm100.f64);
+    mov $0x0640000000000000,%rax #100 at exponent's place
+    movd %rax,%xmm2
+    paddq %xmm2,%xmm0     #xmm0 = twop100 * x (integer add of 100 to the exponent field)
+    addsd %xmm1,%xmm0     #xmm0 = twop100 * x + abs(x)
+    psubq %xmm2,%xmm0     #xmm0 = (twop100 * x + abs(x)) * twopm100 (integer subtract of 100 from the exponent field)
+    ret    
+      
+    
+    .p2align 4
+    # Reached when x > 709.8 or x is NaN (xmm0 = x): returns +inf for finite x and +inf, or x + x for a NaN.
+.L__Max_Arg:
+   movd %xmm0,%rcx
+   mov $0x7ff0000000000000,%rax         #bit pattern of +inf
+   cmp %rax,%rcx                        #unsigned compare of the bits of x against +inf
+   jb .L__return_Inf                    #below: x is finite, the result overflows to +inf
+   mov $0x000fffffffffffff,%rdx         #check if x is Nan
+   and %rdx,%rcx                        #nonzero mantissa means NaN, zero mantissa means x is +inf
+   jne .L__Nan
+.L__return_Inf:                         #also entered from .L__M_Equals_1024 with rax = 0x7ff0000000000000
+   movd %rax,%xmm0
+   #call error_handler  
+   ret
+   .p2align 4 
+.L__Nan:
+    addsd   %xmm0,%xmm0                 #x + x: the result is a quiet NaN
+    ret      
+    
+    .p2align 4  
+.L__Min_Arg:                       #reached when x < .L__min_expm1_arg (about -37.43)
+    mov $0xBFF0000000000000,%rax   #return -1
+    #call error handler
+    movd %rax,%xmm0                #xmm0 = -1.0
+    ret      
+    
+.section .rodata                   # constants are only ever read, so keep them out of writable .data
+.align 16
+.L__max_expm1_arg:                 # 709.8: larger inputs (and NaN) go to .L__Max_Arg
+    .quad 0x40862E6666666666    
+.L__min_expm1_arg:                 # about -37.43: smaller inputs return -1
+    .quad 0xC042B708872320E1
+.L__log_OneMinus_OneByFour:        # log(1 - 1/4), about -0.2877
+    .quad 0xBFD269621134DB93
+.L__log_OnePlus_OneByFour:         # log(1 + 1/4), about 0.2231
+    .quad 0x3FCC8FF7C79A9A22
+.L__thirtyTwo_by_ln2:              # 32/ln(2)
+    .quad 0x40471547652B82FE
+.L__zero:
+    .quad 0x0000000000000000    
+.L__point_Five:
+    .quad 0x3FE0000000000000
+    
+.align 16                          # mulpd reads this pair as an aligned 16-byte operand
+.L__Ln2By32_MinusTrailLead:        # low quadword = leading part of ln(2)/32, high quadword = minus its trailing part
+    .octa 0xBD8473DE6AF278ED3F962E42FEF00000 
+.L__A5:                            # A1..A5: coefficients of the normal-flow polynomial q(r); A1 is exactly 1/2
+    .quad 0x3F56C1728D739765
+.L__A4:
+    .quad 0x3F811115B7AA905E
+.L__A3:
+    .quad 0x3FA5555555545D4E
+.L__A2:
+    .quad 0x3FC5555555548F7C
+.L__A1:
+    .quad 0x3FE0000000000000      
+.L__One:
+    .quad 0x3FF0000000000000
+
+.align 16
+# S_lead_and_trail_table: 32 entries of 16 bytes (512 bytes in total); no .type or .size is emitted for it.
+# Entry j approximates 2^(j/32) as two doubles: low quadword = S_L (leading part), high quadword = S_T (trailing part).
+S_lead_and_trail_table:
+	.octa  0x00000000000000003FF0000000000000
+	.octa  0x3D0A1D73E2A475B43FF059B0D3158540
+	.octa  0x3CEEC5317256E3083FF0B5586CF98900
+	.octa  0x3CF0A4EBBF1AED933FF11301D0125B40
+	.octa  0x3D0D6E6FBE4628763FF172B83C7D5140
+	.octa  0x3D053C02DC0144C83FF1D4873168B980
+	.octa  0x3D0C3360FD6D8E0B3FF2387A6E756200
+	.octa  0x3D009612E8AFAD123FF29E9DF51FDEC0
+	.octa  0x3CF52DE8D5A463063FF306FE0A31B700
+	.octa  0x3CE54E28AA05E8A93FF371A7373AA9C0
+	.octa  0x3D011ADA0911F09F3FF3DEA64C123400
+	.octa  0x3D068189B7A04EF83FF44E0860618900
+	.octa  0x3D038EA1CBD7F6213FF4BFDAD5362A00
+	.octa  0x3CBDF0A83C49D86A3FF5342B569D4F80
+	.octa  0x3D04AC64980A8C8F3FF5AB07DD485400
+	.octa  0x3CD2C7C3E81BF4B73FF6247EB03A5580
+	.octa  0x3CE921165F626CDD3FF6A09E667F3BC0
+	.octa  0x3D09EE91B87977853FF71F75E8EC5F40
+	.octa  0x3CDB5F54408FDB373FF7A11473EB0180
+	.octa  0x3CF28ACF88AFAB353FF82589994CCE00
+	.octa  0x3CFB5BA7C55A192D3FF8ACE5422AA0C0
+	.octa  0x3D027A280E1F92A03FF93737B0CDC5C0
+	.octa  0x3CF01C7C46B071F33FF9C49182A3F080
+	.octa  0x3CFC8B424491CAF83FFA5503B23E2540
+	.octa  0x3D06AF439A68BB993FFAE89F995AD380
+	.octa  0x3CDBAA9EC206AD4F3FFB7F76F2FB5E40
+	.octa  0x3CFC2220CB12A0923FFC199BDD855280
+	.octa  0x3D048A81E5E8F4A53FFCB720DCEF9040
+	.octa  0x3CDC976816BAD9B83FFD5818DCFBA480
+	.octa  0x3CFEB968CAC39ED33FFDFC97337B9B40
+	.octa  0x3CF9858F73A18F5E3FFEA4AFA2A490C0
+	.octa  0x3C99D3E12DD8A18B3FFF50765B6E4540
+
+.align 16
+.L__Five_Pont_FiveEMinus17:        # 2^-54 (about 5.55e-17): below this magnitude .L__VeryTinyArg is used
+    .quad 0x3C90000000000000
+.L__B9:                            # B1..B9: coefficients of the small-argument polynomial q(x), highest degree first
+    .quad 0x3E5A2836AA646B96
+.L__B8:
+    .quad 0x3E928295484734EA
+.L__B7:
+    .quad 0x3EC71E14BFE3DB59
+.L__B6:
+    .quad 0x3EFA019F635825C4
+.L__B5:
+    .quad 0x3F2A01A01159DD2D
+.L__B4:
+    .quad 0x3F56C16C16CE14C6
+.L__B3:
+    .quad 0x3F8111111111A9F3
+.L__B2:                            # close to 1/24
+    .quad 0x3FA55555555554B6
+.L__B1:                            # close to 1/6
+    .quad 0x3FC5555555555549
+.L__TwopM7:                        # 2^-7: threshold on y = u*u*0.5 that selects the final summation order
+    .quad 0x3F80000000000000
diff --git a/src/gas/expm1f.S b/src/gas/expm1f.S
new file mode 100644
index 0000000..6e7ca03
--- /dev/null
+++ b/src/gas/expm1f.S
@@ -0,0 +1,323 @@
+
+#
+#  (C) 2008-2009 Advanced Micro Devices, Inc. All Rights Reserved.
+#
+#  This file is part of libacml_mv.
+#
+#  libacml_mv is free software; you can redistribute it and/or
+#  modify it under the terms of the GNU Lesser General Public
+#  License as published by the Free Software Foundation; either
+#  version 2.1 of the License, or (at your option) any later version.
+#
+#  libacml_mv is distributed in the hope that it will be useful,
+#  but WITHOUT ANY WARRANTY; without even the implied warranty of
+#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+#  Lesser General Public License for more details.
+#
+#  You should have received a copy of the GNU Lesser General Public
+#  License along with libacml_mv.  If not, see
+#  <http://www.gnu.org/licenses/>.
+#
+#
+
+
+#include "fn_macros.h"
+#define fname FN_PROTOTYPE(expm1f)
+#define fname_special _expm1f_special@PLT
+
+#ifdef __ELF__
+    .section .note.GNU-stack,"",@progbits
+#endif
+
+	.text
+	.p2align 4
+.globl fname
+	.type	fname, @function
+
+fname:
+    ucomiss .L__max_expm1_arg(%rip),%xmm0         ##if(x > max_expm1_arg)
+    ja .L__Max_Arg
+    jp .L__Max_Arg
+    ucomiss .L__log_OnePlus_OneByFour(%rip),%xmm0 ##if(x < log_OnePlus_OneByFour)
+    jae .L__Normal_Flow
+    ucomiss .L__log_OneMinus_OneByFour(%rip),%xmm0 ##if(x > log_OneMinus_OneByFour)
+    ja .L__Small_Arg
+    ucomiss .L__min_expm1_arg(%rip),%xmm0         ##if(x < min_expm1_arg)
+    jb .L__Min_Arg
+    
+    .p2align 4
+.L__Normal_Flow:
+    movaps %xmm0,%xmm1     #xmm1 = x
+    mulss .L__thirtyTwo_by_ln2(%rip),%xmm1   #xmm1 = x*thirtyTwo_by_ln2
+    movd %xmm1,%eax        #eax = x*thirtyTwo_by_ln2
+    and $0x80000000,%eax   #get the sign of x*thirtyTwo_by_ln2
+    or  $0x3F000000,%eax   #make +/- 0.5    
+    movd %eax,%xmm2        #xmm2 = +/- 0.5
+    addss %xmm2,%xmm1      #xmm1 = (x*32/ln2) +/- 0.5        
+	cvttps2dq %xmm1,%xmm2  #xmm2 = n = (int)(temp)
+	mov $0x0000001f,%edx
+	movd %edx,%xmm1
+	andps %xmm2,%xmm1      #xmm1 = j
+    movd %xmm2,%ecx        #ecx = n	
+	sarl	$5, %ecx       #ecx = m = n >> 5    
+	#xor %rdx,%rdx         #make it zeros, to be used for address	
+	movd %xmm1,%edx        #edx = j
+	lea  S_lead_and_trail_table(%rip),%rax	
+	movsd  (%rax,%rdx,8),%xmm3 #xmm3 = S_T,S_L
+    punpckldq %xmm2,%xmm1  #xmm1 = n,j    	
+	psubd %xmm1,%xmm2      #xmm2 = n1
+    punpcklqdq %xmm2,%xmm1 #xmm1 = n1,n,j    
+	cvtdq2ps %xmm1,%xmm1   #xmm1 = (float)(n1,n,j)
+		
+	#r2 = -(n*ln2_by_ThirtyTwo_trail);
+    #r1 = (x-n1*ln2_by_ThirtyTwo_lead) - j*ln2_by_ThirtyTwo_lead;	
+    mulps .L__Ln2By32_LeadTrailLead(%rip),%xmm1
+    movhlps %xmm1,%xmm2    #xmm2 = n1*ln2/32lead
+    movaps %xmm0,%xmm4     #xmm4 = x
+    subss %xmm2,%xmm4      #xmm4 = x - n1*ln2/32lead
+    subss %xmm1,%xmm4      #xmm4 = r1
+    psrldq $4,%xmm1        #xmm1 = -r2 should take care of sign later
+    
+    #r = r1 + r2;
+    movaps %xmm4,%xmm7     #xmm7 = r1   
+    subss %xmm1,%xmm4      #xmm4 = r = r1-(-r2) = r1 + r2
+    
+    #q = r*r*(B1+r*(B2));
+    movaps %xmm4,%xmm6         #xmm6 = r
+    mulss .L__B2_f(%rip),%xmm6 #xmm6 = r * B2
+    addss .L__B1_f(%rip),%xmm6 #xmm6 = B1 + (r * B2)
+    mulss %xmm4,%xmm6
+    mulss %xmm4,%xmm6          #xmm6 = q    
+    
+    #p = (r2+q) + r1;
+    subss %xmm1,%xmm6
+    addss %xmm7,%xmm6          #xmm6 = p
+
+    #s = S_L.f32 + S_T.f32;    
+    movdqa %xmm3,%xmm2     #xmm2 = S_T,S_L
+    psrldq $4,%xmm2        #xmm2 =     S_T
+    movaps %xmm2,%xmm5     #xmm5 =     S_T
+    addss %xmm3,%xmm2      #xmm2 = s    
+    
+    cmp  $0xfffffff9,%ecx  #Check m < -7
+    jl .L__M_Below_Minus7
+	cmp $23,%ecx           #Check m > 23
+	jg .L__M_Above_23
+	# -8 < m < 24
+    #twopm.f32 * ((S_L.f32 - twopmm.f32) + (S_L.f32*p+ S_T.f32 *(1+p)));
+    movaps %xmm3,%xmm2   #xmm2 = S_L
+    mulss %xmm6,%xmm2     #xmm2 = S_L * p
+    addss .L__One_f(%rip),%xmm6   #xmm6 = 1+p
+    mulss %xmm5,%xmm6     #xmm6 = S_T *(1+p)
+    addss %xmm6,%xmm2     #xmm2 = (S_L.f32*p+ S_T.f32 *(1+p))
+    mov $127,%eax
+    sub %ecx,%eax          #eax = 127 - m
+	shl  $23,%eax          #eax = 2^-m    
+    movd %eax,%xmm1    
+    subss %xmm1,%xmm3     #xmm3 = (S_L.f32 - twopmm.f32)
+    addss %xmm3,%xmm2     #xmm2 = ((S_L.f32 - twopmm.f32) + (S_L.f32*p+ S_T.f32 *(1+p)))   
+    shl  $23,%ecx
+    movd %ecx,%xmm0
+    paddd %xmm2,%xmm0
+    ret     
+   
+    .p2align 4
+.L__M_Below_Minus7:
+    #twopm.f32 * (S_L.f32 + (s*p + S_T.f32)) - 1;
+    mulss %xmm6,%xmm2     #xmm2 = s*p
+    addss %xmm5,%xmm2     #xmm2 = s*p + S_T
+    addss %xmm3,%xmm2     #xmm2 = (S_L.f32 + (s*p + S_T.f32))
+    shl  $23,%ecx  
+    movd %ecx,%xmm0
+    paddd %xmm2,%xmm0
+    subss .L__One_f(%rip),%xmm0            
+    ret  
+            
+    .p2align 4
+.L__M_Above_23:
+    #twopm.f32 * (S_L.f32 + (s*p+(S_T.f32 - twopmm.f32)));
+    cmp  $0x00000080,%ecx  #Check m < 128    
+    je .L__M_Equals_128        
+    cmp  $47,%ecx          #Check m > 47
+    ja .L__M_Above_47        
+    mov $127,%eax
+    sub %ecx,%eax          #eax = 127 - m
+	shl  $23,%eax          #eax = 2^-m    
+    movd %eax,%xmm1
+    subss %xmm1,%xmm5      #xmm5 = S_T.f32 - twopmm.f32
+    
+    .p2align 4
+.L__M_Above_47:    
+    shl  $23,%ecx    
+    mulss %xmm6,%xmm2      #xmm2 = s*p
+    addss %xmm5,%xmm2
+    addss %xmm3,%xmm2
+    movd %ecx,%xmm0
+    paddd %xmm2,%xmm0
+    ret    
+        
+    .p2align 4	
+.L__M_Equals_128:
+    mov $0x3f800000,%ecx  #127 at exponent
+    mulss %xmm6,%xmm2     #xmm2 = s*p
+    addss %xmm5,%xmm2     #xmm2 = s*p + S_T
+    addss %xmm3,%xmm2     #xmm2 = (S_L.f32 + (s*p + S_T.f32))
+    movd %ecx,%xmm1       #127
+    paddd %xmm2,%xmm1     #2^127*(S_L.f32 + (s*p + S_T.f32))
+    mov $0x00800000,%ecx  #multiply with one more 2
+    movd %ecx,%xmm2
+    paddd %xmm2,%xmm1
+    movd %xmm1,%ecx
+    and $0x7f800000,%ecx  #check if we reached +inf
+    cmp $0x7f800000,%ecx
+    je .L__Overflow
+    movdqa %xmm1,%xmm0
+    ret	
+	
+	.p2align 4
+.L__Small_Arg:
+    movd %xmm0,%eax
+    and $0x7fffffff,%eax    #eax = abs(x)
+    cmp $0x33000000,%eax    #check abs(x) < 2^-25
+    jl .L__VeryTiny_Arg