toom_interpolate_5pts.c revision 1.1.1.1
1/* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
2
3   Contributed to the GNU project by Robert Harley.
4   Improvements by Paul Zimmermann and Marco Bodrato.
5
6   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10Copyright 2000, 2001, 2002, 2003, 2005, 2006, 2007, 2009 Free Software
11Foundation, Inc.
12
13This file is part of the GNU MP Library.
14
15The GNU MP Library is free software; you can redistribute it and/or modify it
16under the terms of the GNU Lesser General Public License as published by the
17Free Software Foundation; either version 3 of the License, or (at your
18option) any later version.
19
20The GNU MP Library is distributed in the hope that it will be useful, but
21WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
22FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
23for more details.
24
25You should have received a copy of the GNU Lesser General Public License
26along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
27
28#include "gmp.h"
29#include "gmp-impl.h"
30
31void
32mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
33			   mp_size_t k, mp_size_t twor, int sa,
34			   mp_limb_t vinf0)
35{
36  mp_limb_t cy, saved;
37  mp_size_t twok;
38  mp_size_t kk1;
39  mp_ptr c1, v1, c3, vinf;
40
41  twok = k + k;
42  kk1 = twok + 1;
43
44  c1 = c  + k;
45  v1 = c1 + k;
46  c3 = v1 + k;
47  vinf = c3 + k;
48
49#define v0 (c)
50  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
51     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
52  */
53  if (sa)
54    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
55  else
56    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
57
58  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
59       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */
60
61  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
62						      /* (5 3 1 1 0)*/
63
64  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
65       v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */
66
67  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
68     tm1 >= 0                                         (0  1 0  1 0)
69     No carry comes out from {v1, kk1} +/- {vm1, kk1},
70     and the division by two is exact.
71     If (sa!=0) the sign of vm1 is negative */
72  if (sa)
73    {
74#ifdef HAVE_NATIVE_mpn_rsh1add_n
75      mpn_rsh1add_n (vm1, v1, vm1, kk1);
76#else
77      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
78      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
79#endif
80    }
81  else
82    {
83#ifdef HAVE_NATIVE_mpn_rsh1sub_n
84      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
85#else
86      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
87      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
88#endif
89    }
90
91  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
92       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
93
94  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
95     t1 >= 0
96  */
97  vinf[0] -= mpn_sub_n (v1, v1, c, twok);
98
99  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
100       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
101
102  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
103     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
104  */
105#ifdef HAVE_NATIVE_mpn_rsh1sub_n
106  mpn_rsh1sub_n (v2, v2, v1, kk1);
107#else
108  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
109  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
110#endif
111
112  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
113       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */
114
115  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
116     result is v1 >= 0
117  */
118  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
119
120  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
121  cy = mpn_add_n (c1, c1, vm1, kk1);
122  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
123  /* Memory allocated for vm1 is now free, it can be recycled ...*/
124
125  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
126     result is v2 >= 0 */
127  saved = vinf[0];       /* Remember v1's highest byte (will be overwritten). */
128  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
129#ifdef HAVE_NATIVE_mpn_sublsh1_n
130  cy = mpn_sublsh1_n (v2, v2, vinf, twor);
131#else
132  /* Overwrite unused vm1 */
133  cy = mpn_lshift (vm1, vinf, twor, 1);
134  cy += mpn_sub_n (v2, v2, vm1, twor);
135#endif
136  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
137
138  /* Current matrix is
139     [1 0 0 0 0; vinf
140      0 1 0 0 0; v2
141      1 0 1 0 0; v1
142      0 1 0 1 0; vm1
143      0 0 0 0 1] v0
144     Some vaues already are in-place (we added vm1 in the correct position)
145     | vinf|  v1 |  v0 |
146	      | vm1 |
147     One still is in a separated area
148	| +v2 |
149     We have to compute v1-=vinf; vm1 -= v2,
150	   |-vinf|
151	      | -v2 |
152     Carefully reordering operations we can avoid to compute twice the sum
153     of the high half of v2 plus the low half of vinf.
154  */
155
156  /* Add the high half of t2 in {vinf} */
157  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
158    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
159    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
160  } else { /* triggered only by very unbalanced cases like
161	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
162    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
163  }
164  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
165     result is >= 0 */
166  /* Side effect: we also subtracted (high half) vm1 -= v2 */
167  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
168  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
169  vinf[0] = saved;
170  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last bytes.       */
171
172  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
173     Operate only on the low half.
174  */
175  cy = mpn_sub_n (c1, c1, v2, k);
176  MPN_DECR_U (v1, kk1, cy);
177
178  /********************* Beginning the final phase **********************/
179
180  /* Most of the recomposition was done */
181
182  /* add t2 in {c+3k, ...}, but only the low half */
183  cy = mpn_add_n (c3, c3, v2, k);
184  vinf[0] += cy;
185  ASSERT(vinf[0] >= cy); /* No carry */
186  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
187
188#undef v0
189}
190