toom_interpolate_5pts.c revision 1.1.1.3
1/* mpn_toom_interpolate_5pts -- Interpolate for toom3, 33, 42.
2
3   Contributed to the GNU project by Robert Harley.
4   Improvements by Paul Zimmermann and Marco Bodrato.
5
6   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
7   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
8   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9
10Copyright 2000-2003, 2005-2007, 2009 Free Software Foundation, Inc.
11
12This file is part of the GNU MP Library.
13
14The GNU MP Library is free software; you can redistribute it and/or modify
15it under the terms of either:
16
17  * the GNU Lesser General Public License as published by the Free
18    Software Foundation; either version 3 of the License, or (at your
19    option) any later version.
20
21or
22
23  * the GNU General Public License as published by the Free Software
24    Foundation; either version 2 of the License, or (at your option) any
25    later version.
26
27or both in parallel, as here.
28
29The GNU MP Library is distributed in the hope that it will be useful, but
30WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
31or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
32for more details.
33
34You should have received copies of the GNU General Public License and the
35GNU Lesser General Public License along with the GNU MP Library.  If not,
36see https://www.gnu.org/licenses/.  */
37
38#include "gmp.h"
39#include "gmp-impl.h"
40
41void
42mpn_toom_interpolate_5pts (mp_ptr c, mp_ptr v2, mp_ptr vm1,
43			   mp_size_t k, mp_size_t twor, int sa,
44			   mp_limb_t vinf0)
45{
46  mp_limb_t cy, saved;
47  mp_size_t twok;
48  mp_size_t kk1;
49  mp_ptr c1, v1, c3, vinf;
50
51  twok = k + k;
52  kk1 = twok + 1;
53
54  c1 = c  + k;
55  v1 = c1 + k;
56  c3 = v1 + k;
57  vinf = c3 + k;
58
59#define v0 (c)
60  /* (1) v2 <- v2-vm1 < v2+|vm1|,       (16 8 4 2 1) - (1 -1 1 -1  1) =
61     thus 0 <= v2 < 50*B^(2k) < 2^6*B^(2k)             (15 9 3  3  0)
62  */
63  if (sa)
64    ASSERT_NOCARRY (mpn_add_n (v2, v2, vm1, kk1));
65  else
66    ASSERT_NOCARRY (mpn_sub_n (v2, v2, vm1, kk1));
67
68  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
69       v0       v1       hi(vinf)       |vm1|     v2-vm1      EMPTY */
70
71  ASSERT_NOCARRY (mpn_divexact_by3 (v2, v2, kk1));    /* v2 <- v2 / 3 */
72						      /* (5 3 1 1 0)*/
73
74  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
75       v0       v1      hi(vinf)       |vm1|     (v2-vm1)/3    EMPTY */
76
77  /* (2) vm1 <- tm1 := (v1 - vm1) / 2  [(1 1 1 1 1) - (1 -1 1 -1 1)] / 2 =
78     tm1 >= 0                                         (0  1 0  1 0)
79     No carry comes out from {v1, kk1} +/- {vm1, kk1},
80     and the division by two is exact.
81     If (sa!=0) the sign of vm1 is negative */
82  if (sa)
83    {
84#ifdef HAVE_NATIVE_mpn_rsh1add_n
85      mpn_rsh1add_n (vm1, v1, vm1, kk1);
86#else
87      ASSERT_NOCARRY (mpn_add_n (vm1, v1, vm1, kk1));
88      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
89#endif
90    }
91  else
92    {
93#ifdef HAVE_NATIVE_mpn_rsh1sub_n
94      mpn_rsh1sub_n (vm1, v1, vm1, kk1);
95#else
96      ASSERT_NOCARRY (mpn_sub_n (vm1, v1, vm1, kk1));
97      ASSERT_NOCARRY (mpn_rshift (vm1, vm1, kk1, 1));
98#endif
99    }
100
101  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
102       v0       v1        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
103
104  /* (3) v1 <- t1 := v1 - v0    (1 1 1 1 1) - (0 0 0 0 1) = (1 1 1 1 0)
105     t1 >= 0
106  */
107  vinf[0] -= mpn_sub_n (v1, v1, c, twok);
108
109  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
110       v0     v1-v0        hi(vinf)       tm1     (v2-vm1)/3    EMPTY */
111
112  /* (4) v2 <- t2 := ((v2-vm1)/3-t1)/2 = (v2-vm1-3*t1)/6
113     t2 >= 0                  [(5 3 1 1 0) - (1 1 1 1 0)]/2 = (2 1 0 0 0)
114  */
115#ifdef HAVE_NATIVE_mpn_rsh1sub_n
116  mpn_rsh1sub_n (v2, v2, v1, kk1);
117#else
118  ASSERT_NOCARRY (mpn_sub_n (v2, v2, v1, kk1));
119  ASSERT_NOCARRY (mpn_rshift (v2, v2, kk1, 1));
120#endif
121
122  /* {c,2k} {c+2k,2k+1} {c+4k+1,2r-1} {t,2k+1} {t+2k+1,2k+1} {t+4k+2,2r}
123       v0     v1-v0        hi(vinf)     tm1    (v2-vm1-3t1)/6    EMPTY */
124
125  /* (5) v1 <- t1-tm1           (1 1 1 1 0) - (0 1 0 1 0) = (1 0 1 0 0)
126     result is v1 >= 0
127  */
128  ASSERT_NOCARRY (mpn_sub_n (v1, v1, vm1, kk1));
129
130  /* We do not need to read the value in vm1, so we add it in {c+k, ...} */
131  cy = mpn_add_n (c1, c1, vm1, kk1);
132  MPN_INCR_U (c3 + 1, twor + k - 1, cy); /* 2n-(3k+1) = 2r+k-1 */
133  /* Memory allocated for vm1 is now free, it can be recycled ...*/
134
135  /* (6) v2 <- v2 - 2*vinf,     (2 1 0 0 0) - 2*(1 0 0 0 0) = (0 1 0 0 0)
136     result is v2 >= 0 */
137  saved = vinf[0];       /* Remember v1's highest byte (will be overwritten). */
138  vinf[0] = vinf0;       /* Set the right value for vinf0                     */
139#ifdef HAVE_NATIVE_mpn_sublsh1_n_ip1
140  cy = mpn_sublsh1_n_ip1 (v2, vinf, twor);
141#else
142  /* Overwrite unused vm1 */
143  cy = mpn_lshift (vm1, vinf, twor, 1);
144  cy += mpn_sub_n (v2, v2, vm1, twor);
145#endif
146  MPN_DECR_U (v2 + twor, kk1 - twor, cy);
147
148  /* Current matrix is
149     [1 0 0 0 0; vinf
150      0 1 0 0 0; v2
151      1 0 1 0 0; v1
152      0 1 0 1 0; vm1
153      0 0 0 0 1] v0
154     Some values already are in-place (we added vm1 in the correct position)
155     | vinf|  v1 |  v0 |
156	      | vm1 |
157     One still is in a separated area
158	| +v2 |
159     We have to compute v1-=vinf; vm1 -= v2,
160	   |-vinf|
161	      | -v2 |
162     Carefully reordering operations we can avoid to compute twice the sum
163     of the high half of v2 plus the low half of vinf.
164  */
165
166  /* Add the high half of t2 in {vinf} */
167  if ( LIKELY(twor > k + 1) ) { /* This is the expected flow  */
168    cy = mpn_add_n (vinf, vinf, v2 + k, k + 1);
169    MPN_INCR_U (c3 + kk1, twor - k - 1, cy); /* 2n-(5k+1) = 2r-k-1 */
170  } else { /* triggered only by very unbalanced cases like
171	      (k+k+(k-2))x(k+k+1) , should be handled by toom32 */
172    ASSERT_NOCARRY (mpn_add_n (vinf, vinf, v2 + k, twor));
173  }
174  /* (7) v1 <- v1 - vinf,       (1 0 1 0 0) - (1 0 0 0 0) = (0 0 1 0 0)
175     result is >= 0 */
176  /* Side effect: we also subtracted (high half) vm1 -= v2 */
177  cy = mpn_sub_n (v1, v1, vinf, twor);          /* vinf is at most twor long.  */
178  vinf0 = vinf[0];                     /* Save again the right value for vinf0 */
179  vinf[0] = saved;
180  MPN_DECR_U (v1 + twor, kk1 - twor, cy);       /* Treat the last bytes.       */
181
182  /* (8) vm1 <- vm1-v2          (0 1 0 1 0) - (0 1 0 0 0) = (0 0 0 1 0)
183     Operate only on the low half.
184  */
185  cy = mpn_sub_n (c1, c1, v2, k);
186  MPN_DECR_U (v1, kk1, cy);
187
188  /********************* Beginning the final phase **********************/
189
190  /* Most of the recomposition was done */
191
192  /* add t2 in {c+3k, ...}, but only the low half */
193  cy = mpn_add_n (c3, c3, v2, k);
194  vinf[0] += cy;
195  ASSERT(vinf[0] >= cy); /* No carry */
196  MPN_INCR_U (vinf, twor, vinf0); /* Add vinf0, propagate carry. */
197
198#undef v0
199}
200