mpn/generic/toom_interpolate_6pts.c - gmp - Git at Google

 /* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52

    Contributed to the GNU project by Marco Bodrato.

    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

 Copyright 2009, 2010, 2012 Free Software Foundation, Inc.

 This file is part of the GNU MP Library.

 The GNU MP Library is free software; you can redistribute it and/or modify
 it under the terms of either:

   * the GNU Lesser General Public License as published by the Free
     Software Foundation; either version 3 of the License, or (at your
     option) any later version.

 or

   * the GNU General Public License as published by the Free Software
     Foundation; either version 2 of the License, or (at your option) any
     later version.

 or both in parallel, as here.

 The GNU MP Library is distributed in the hope that it will be useful, but
 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 for more details.

 You should have received copies of the GNU General Public License and the
 GNU Lesser General Public License along with the GNU MP Library.  If not,
 see https://www.gnu.org/licenses/.  */

 #include "gmp-impl.h"

 #define BINVERT_3 MODLIMB_INVERSE_3

 /* For odd divisors, mpn_divexact_1 works fine with two's complement. */
 #ifndef mpn_divexact_by3
 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
 #define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
 #else
 #define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
 #endif
 #endif

 /* Interpolation for Toom-3.5, using the evaluation points: infinity,
    1, -1, 2, -2. More precisely, we want to compute
    f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
    six values

      w5 = f(0),
      w4 = f(-1),
      w3 = f(1)
      w2 = f(-2),
      w1 = f(2),
      w0 = limit at infinity of f(x) / x^5,

    The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
    {pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
    {pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
    significant limbs small). f(-1) and f(-2) may be negative, signs
    determined by the flag bits. All intermediate results are positive.
    Inputs are destroyed.

    Interpolation sequence was taken from the paper: "Integer and
    Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
    Some slight variations were introduced: adaptation to "gmp
    instruction set", and a final saving of an operation by interlacing
    interpolation and recomposition phases.
 */

 void
 mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
 			   mp_ptr w4, mp_ptr w2, mp_ptr w1,
 			   mp_size_t w0n)
 {
   mp_limb_t cy;
   /* cy6 can be stored in w1[2*n], cy4 in w4[0], embankment in w2[0] */
   mp_limb_t cy4, cy6, embankment;

   ASSERT( n > 0 );
   ASSERT( 2*n >= w0n && w0n > 0 );

 #define w5  pp					/* 2n   */
 #define w3  (pp + 2 * n)			/* 2n+1 */
 #define w0  (pp + 5 * n)			/* w0n  */

   /* Interpolate with sequence:
      W2 =(W1 - W2)>>2
      W1 =(W1 - W5)>>1
      W1 =(W1 - W2)>>1
      W4 =(W3 - W4)>>1
      W2 =(W2 - W4)/3
      W3 = W3 - W4 - W5
      W1 =(W1 - W3)/3
      // Last steps are mixed with recomposition...
      W2 = W2 - W0<<2
      W4 = W4 - W2
      W3 = W3 - W1
      W2 = W2 - W0
   */

   /* W2 =(W1 - W2)>>2 */
   if (flags & toom6_vm2_neg)
     mpn_add_n (w2, w1, w2, 2 * n + 1);
   else
     mpn_sub_n (w2, w1, w2, 2 * n + 1);
   mpn_rshift (w2, w2, 2 * n + 1, 2);

   /* W1 =(W1 - W5)>>1 */
   w1[2*n] -= mpn_sub_n (w1, w1, w5, 2*n);
   mpn_rshift (w1, w1, 2 * n + 1, 1);

   /* W1 =(W1 - W2)>>1 */
 #if HAVE_NATIVE_mpn_rsh1sub_n
   mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
 #else
   mpn_sub_n (w1, w1, w2, 2 * n + 1);
   mpn_rshift (w1, w1, 2 * n + 1, 1);
 #endif

   /* W4 =(W3 - W4)>>1 */
   if (flags & toom6_vm1_neg)
     {
 #if HAVE_NATIVE_mpn_rsh1add_n
       mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
 #else
       mpn_add_n (w4, w3, w4, 2 * n + 1);
       mpn_rshift (w4, w4, 2 * n + 1, 1);
 #endif
     }
   else
     {
 #if HAVE_NATIVE_mpn_rsh1sub_n
       mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
 #else
       mpn_sub_n (w4, w3, w4, 2 * n + 1);
       mpn_rshift (w4, w4, 2 * n + 1, 1);
 #endif
     }

   /* W2 =(W2 - W4)/3 */
   mpn_sub_n (w2, w2, w4, 2 * n + 1);
   mpn_divexact_by3 (w2, w2, 2 * n + 1);

   /* W3 = W3 - W4 - W5 */
   mpn_sub_n (w3, w3, w4, 2 * n + 1);
   w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

   /* W1 =(W1 - W3)/3 */
   mpn_sub_n (w1, w1, w3, 2 * n + 1);
   mpn_divexact_by3 (w1, w1, 2 * n + 1);

   /*
     [1 0 0 0 0 0;
      0 1 0 0 0 0;
      1 0 1 0 0 0;
      0 1 0 1 0 0;
      1 0 1 0 1 0;
      0 0 0 0 0 1]

     pp[] prior to operations:
      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|

     summation scheme for remaining operations:
      |______________5|n_____4|n_____3|n_____2|n______|n______|pp
      |_H w0__|_L w0__|______||_H w3__|_L w3__|_H w5__|_L w5__|
 				    || H w4  | L w4  |
 		    || H w2  | L w2  |
 	    || H w1  | L w1  |
 			    ||-H w1  |-L w1  |
 		     |-H w0  |-L w0 ||-H w2  |-L w2  |
   */
   cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
   MPN_INCR_U (pp + 3 * n + 1, n, cy);

   /* W2 -= W0<<2 */
 #if HAVE_NATIVE_mpn_sublsh_n || HAVE_NATIVE_mpn_sublsh2_n_ip1
 #if HAVE_NATIVE_mpn_sublsh2_n_ip1
   cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
 #else
   cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
 #endif
 #else
   /* {W4,2*n+1} is now free and can be overwritten. */
   cy = mpn_lshift(w4, w0, w0n, 2);
   cy+= mpn_sub_n(w2, w2, w4, w0n);
 #endif
   MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

   /* W4L = W4L - W2L */
   cy = mpn_sub_n (pp + n, pp + n, w2, n);
   MPN_DECR_U (w3, 2 * n + 1, cy);

   /* W3H = W3H + W2L */
   cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
   /* W1L + W2H */
   cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
   MPN_INCR_U (w1 + n, n + 1, cy);

   /* W0 = W0 + W1H */
   if (LIKELY (w0n > n))
     cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
   else
     cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

   /*
     summation scheme for the next operation:
      |...____5|n_____4|n_____3|n_____2|n______|n______|pp
      |...w0___|_w1_w2_|_H w3__|_L w3__|_H w5__|_L w5__|
 		     ...-w0___|-w1_w2 |
   */
   /* if(LIKELY(w0n>n)) the two operands below DO overlap! */
   cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);

   /* embankment is a "dirty trick" to avoid carry/borrow propagation
      beyond allocated memory */
   embankment = w0[w0n - 1] - 1;
   w0[w0n - 1] = 1;
   if (LIKELY (w0n > n)) {
     if (cy4 > cy6)
       MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
     else
       MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
     MPN_INCR_U (w0 + n, w0n - n, cy6);
   } else {
     MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
     MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
   }
   w0[w0n - 1] += embankment;

 #undef w5
 #undef w3
 #undef w0

 }
	/* mpn_toom_interpolate_6pts -- Interpolate for toom43, 52

	Contributed to the GNU project by Marco Bodrato.

	THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
	SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
	GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

	Copyright 2009, 2010, 2012 Free Software Foundation, Inc.

	This file is part of the GNU MP Library.

	The GNU MP Library is free software; you can redistribute it and/or modify
	it under the terms of either:

	* the GNU Lesser General Public License as published by the Free
	Software Foundation; either version 3 of the License, or (at your
	option) any later version.

	or

	* the GNU General Public License as published by the Free Software
	Foundation; either version 2 of the License, or (at your option) any
	later version.

	or both in parallel, as here.

	The GNU MP Library is distributed in the hope that it will be useful, but
	WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
	or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
	for more details.

	You should have received copies of the GNU General Public License and the
	GNU Lesser General Public License along with the GNU MP Library. If not,
	see https://www.gnu.org/licenses/. */

	#include "gmp-impl.h"

	#define BINVERT_3 MODLIMB_INVERSE_3

	/* For odd divisors, mpn_divexact_1 works fine with two's complement. */
	#ifndef mpn_divexact_by3
	#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
	#define mpn_divexact_by3(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,3,BINVERT_3,0)
	#else
	#define mpn_divexact_by3(dst,src,size) mpn_divexact_1(dst,src,size,3)
	#endif
	#endif

	/* Interpolation for Toom-3.5, using the evaluation points: infinity,
	1, -1, 2, -2. More precisely, we want to compute
	f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 5, given the
	six values

	w5 = f(0),
	w4 = f(-1),
	w3 = f(1)
	w2 = f(-2),
	w1 = f(2),
	w0 = limit at infinity of f(x) / x^5,

	The result is stored in {pp, 5*n + w0n}. At entry, w5 is stored at
	{pp, 2n}, w3 is stored at {pp + 2n, 2n+1}, and w0 is stored at
	{pp + 5n, w0n}. The other values are 2n + 1 limbs each (with most
	significant limbs small). f(-1) and f(-2) may be negative, signs
	determined by the flag bits. All intermediate results are positive.
	Inputs are destroyed.

	Interpolation sequence was taken from the paper: "Integer and
	Polynomial Multiplication: Towards Optimal Toom-Cook Matrices".
	Some slight variations were introduced: adaptation to "gmp
	instruction set", and a final saving of an operation by interlacing
	interpolation and recomposition phases.
	*/

	void
	mpn_toom_interpolate_6pts (mp_ptr pp, mp_size_t n, enum toom6_flags flags,
	mp_ptr w4, mp_ptr w2, mp_ptr w1,
	mp_size_t w0n)
	{
	mp_limb_t cy;
	/* cy6 can be stored in w1[2n], cy4 in w4[0], embankment in w2[0] /
	mp_limb_t cy4, cy6, embankment;

	ASSERT( n > 0 );
	ASSERT( 2*n >= w0n && w0n > 0 );

	#define w5 pp /* 2n */
	#define w3 (pp + 2 * n) /* 2n+1 */
	#define w0 (pp + 5 * n) /* w0n */

	/* Interpolate with sequence:
	W2 =(W1 - W2)>>2
	W1 =(W1 - W5)>>1
	W1 =(W1 - W2)>>1
	W4 =(W3 - W4)>>1
	W2 =(W2 - W4)/3
	W3 = W3 - W4 - W5
	W1 =(W1 - W3)/3
	// Last steps are mixed with recomposition...
	W2 = W2 - W0<<2
	W4 = W4 - W2
	W3 = W3 - W1
	W2 = W2 - W0
	*/

	/* W2 =(W1 - W2)>>2 */
	if (flags & toom6_vm2_neg)
	mpn_add_n (w2, w1, w2, 2 * n + 1);
	else
	mpn_sub_n (w2, w1, w2, 2 * n + 1);
	mpn_rshift (w2, w2, 2 * n + 1, 2);

	/* W1 =(W1 - W5)>>1 */
	w1[2n] -= mpn_sub_n (w1, w1, w5, 2n);
	mpn_rshift (w1, w1, 2 * n + 1, 1);

	/* W1 =(W1 - W2)>>1 */
	#if HAVE_NATIVE_mpn_rsh1sub_n
	mpn_rsh1sub_n (w1, w1, w2, 2 * n + 1);
	#else
	mpn_sub_n (w1, w1, w2, 2 * n + 1);
	mpn_rshift (w1, w1, 2 * n + 1, 1);
	#endif

	/* W4 =(W3 - W4)>>1 */
	if (flags & toom6_vm1_neg)
	{
	#if HAVE_NATIVE_mpn_rsh1add_n
	mpn_rsh1add_n (w4, w3, w4, 2 * n + 1);
	#else
	mpn_add_n (w4, w3, w4, 2 * n + 1);
	mpn_rshift (w4, w4, 2 * n + 1, 1);
	#endif
	}
	else
	{
	#if HAVE_NATIVE_mpn_rsh1sub_n
	mpn_rsh1sub_n (w4, w3, w4, 2 * n + 1);
	#else
	mpn_sub_n (w4, w3, w4, 2 * n + 1);
	mpn_rshift (w4, w4, 2 * n + 1, 1);
	#endif
	}

	/* W2 =(W2 - W4)/3 */
	mpn_sub_n (w2, w2, w4, 2 * n + 1);
	mpn_divexact_by3 (w2, w2, 2 * n + 1);

	/* W3 = W3 - W4 - W5 */
	mpn_sub_n (w3, w3, w4, 2 * n + 1);
	w3[2 * n] -= mpn_sub_n (w3, w3, w5, 2 * n);

	/* W1 =(W1 - W3)/3 */
	mpn_sub_n (w1, w1, w3, 2 * n + 1);
	mpn_divexact_by3 (w1, w1, 2 * n + 1);

	/*
	[1 0 0 0 0 0;
	0 1 0 0 0 0;
	1 0 1 0 0 0;
	0 1 0 1 0 0;
	1 0 1 0 1 0;
	0 0 0 0 0 1]

	pp[] prior to operations:
	\|_H w0__\|_L w0__\|______\|\|_H w3__\|_L w3__\|_H w5__\|_L w5__\|

	summation scheme for remaining operations:
	\|______________5\|n_____4\|n_____3\|n_____2\|n______\|n______\|pp
	\|_H w0__\|_L w0__\|______\|\|_H w3__\|_L w3__\|_H w5__\|_L w5__\|
	\|\| H w4 \| L w4 \|
	\|\| H w2 \| L w2 \|
	\|\| H w1 \| L w1 \|
	\|\|-H w1 \|-L w1 \|
	\|-H w0 \|-L w0 \|\|-H w2 \|-L w2 \|
	*/
	cy = mpn_add_n (pp + n, pp + n, w4, 2 * n + 1);
	MPN_INCR_U (pp + 3 * n + 1, n, cy);

	/* W2 -= W0<<2 */
	#if HAVE_NATIVE_mpn_sublsh_n \|\| HAVE_NATIVE_mpn_sublsh2_n_ip1
	#if HAVE_NATIVE_mpn_sublsh2_n_ip1
	cy = mpn_sublsh2_n_ip1 (w2, w0, w0n);
	#else
	cy = mpn_sublsh_n (w2, w2, w0, w0n, 2);
	#endif
	#else
	/* {W4,2n+1} is now free and can be overwritten. /
	cy = mpn_lshift(w4, w0, w0n, 2);
	cy+= mpn_sub_n(w2, w2, w4, w0n);
	#endif
	MPN_DECR_U (w2 + w0n, 2 * n + 1 - w0n, cy);

	/* W4L = W4L - W2L */
	cy = mpn_sub_n (pp + n, pp + n, w2, n);
	MPN_DECR_U (w3, 2 * n + 1, cy);

	/* W3H = W3H + W2L */
	cy4 = w3[2 * n] + mpn_add_n (pp + 3 * n, pp + 3 * n, w2, n);
	/* W1L + W2H */
	cy = w2[2 * n] + mpn_add_n (pp + 4 * n, w1, w2 + n, n);
	MPN_INCR_U (w1 + n, n + 1, cy);

	/* W0 = W0 + W1H */
	if (LIKELY (w0n > n))
	cy6 = w1[2 * n] + mpn_add_n (w0, w0, w1 + n, n);
	else
	cy6 = mpn_add_n (w0, w0, w1 + n, w0n);

	/*
	summation scheme for the next operation:
	\|...____5\|n_____4\|n_____3\|n_____2\|n______\|n______\|pp
	\|...w0___\|_w1_w2_\|_H w3__\|_L w3__\|_H w5__\|_L w5__\|
	...-w0___\|-w1_w2 \|
	*/
	/* if(LIKELY(w0n>n)) the two operands below DO overlap! */
	cy = mpn_sub_n (pp + 2 * n, pp + 2 * n, pp + 4 * n, n + w0n);

	/* embankment is a "dirty trick" to avoid carry/borrow propagation
	beyond allocated memory */
	embankment = w0[w0n - 1] - 1;
	w0[w0n - 1] = 1;
	if (LIKELY (w0n > n)) {
	if (cy4 > cy6)
	MPN_INCR_U (pp + 4 * n, w0n + n, cy4 - cy6);
	else
	MPN_DECR_U (pp + 4 * n, w0n + n, cy6 - cy4);
	MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy);
	MPN_INCR_U (w0 + n, w0n - n, cy6);
	} else {
	MPN_INCR_U (pp + 4 * n, w0n + n, cy4);
	MPN_DECR_U (pp + 3 * n + w0n, 2 * n, cy + cy6);
	}
	w0[w0n - 1] += embankment;

	#undef w5
	#undef w3
	#undef w0

	}