1/* Interpolaton for the algorithm Toom-Cook 8.5-way.
2
3   Contributed to the GNU project by Marco Bodrato.
4
5   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
6   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
7   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
8
9Copyright 2009, 2010 Free Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of the GNU Lesser General Public License as published by
15the Free Software Foundation; either version 3 of the License, or (at your
16option) any later version.
17
18The GNU MP Library is distributed in the hope that it will be useful, but
19WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
21License for more details.
22
23You should have received a copy of the GNU Lesser General Public License
24along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
25
26
27#include "gmp.h"
28#include "gmp-impl.h"
29
30#if GMP_NUMB_BITS < 29
31#error Not implemented: Both sublsh_n(,,,28) should be corrected; r2 and r5 need one more LIMB.
32#endif
33
34#if GMP_NUMB_BITS < 28
35#error Not implemented: divexact_by188513325 and _by182712915 will not work.
36#endif
37
38
39#if HAVE_NATIVE_mpn_sublsh_n
40#define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
41#else
42static mp_limb_t
43DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
44{
45#if USE_MUL_1 && 0
46  return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
47#else
48  mp_limb_t __cy;
49  __cy = mpn_lshift(ws,src,n,s);
50  return    __cy + mpn_sub_n(dst,dst,ws,n);
51#endif
52}
53#endif
54
55#if HAVE_NATIVE_mpn_addlsh_n
56#define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
57#else
58static mp_limb_t
59DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
60{
61#if USE_MUL_1 && 0
62  return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
63#else
64  mp_limb_t __cy;
65  __cy = mpn_lshift(ws,src,n,s);
66  return    __cy + mpn_add_n(dst,dst,ws,n);
67#endif
68}
69#endif
70
71#if HAVE_NATIVE_mpn_subrsh
72#define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
73#else
74/* FIXME: This is not a correct definition, it assumes no carry */
75#define DO_mpn_subrsh(dst,nd,src,ns,s,ws)				\
76do {									\
77  mp_limb_t __cy;							\
78  MPN_DECR_U (dst, nd, src[0] >> s);					\
79  __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws);	\
80  MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy);				\
81} while (0)
82#endif
83
84
85/* FIXME: tuneup should decide the best variant */
86#ifndef AORSMUL_FASTER_AORS_AORSLSH
87#define AORSMUL_FASTER_AORS_AORSLSH 1
88#endif
89#ifndef AORSMUL_FASTER_AORS_2AORSLSH
90#define AORSMUL_FASTER_AORS_2AORSLSH 1
91#endif
92#ifndef AORSMUL_FASTER_2AORSLSH
93#define AORSMUL_FASTER_2AORSLSH 1
94#endif
95#ifndef AORSMUL_FASTER_3AORSLSH
96#define AORSMUL_FASTER_3AORSLSH 1
97#endif
98
99#if GMP_NUMB_BITS < 43
100#define BIT_CORRECTION 1
101#define CORRECTION_BITS GMP_NUMB_BITS
102#else
103#define BIT_CORRECTION 0
104#define CORRECTION_BITS 0
105#endif
106
107#define BINVERT_9 \
108  ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
109
110#define BINVERT_255 \
111  (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
112
113  /* FIXME: find some more general expressions for inverses */
114#if GMP_LIMB_BITS == 32
115#define BINVERT_2835  (GMP_NUMB_MASK &		CNST_LIMB(0x53E3771B))
116#define BINVERT_42525 (GMP_NUMB_MASK &		CNST_LIMB(0x9F314C35))
117#define BINVERT_182712915 (GMP_NUMB_MASK &	CNST_LIMB(0x550659DB))
118#define BINVERT_188513325 (GMP_NUMB_MASK &	CNST_LIMB(0xFBC333A5))
119#define BINVERT_255x182712915L (GMP_NUMB_MASK &	CNST_LIMB(0x6FC4CB25))
120#define BINVERT_255x188513325L (GMP_NUMB_MASK &	CNST_LIMB(0x6864275B))
121#if GMP_NAIL_BITS == 0
122#define BINVERT_255x182712915H CNST_LIMB(0x1B649A07)
123#define BINVERT_255x188513325H CNST_LIMB(0x06DB993A)
124#else /* GMP_NAIL_BITS != 0 */
125#define BINVERT_255x182712915H \
126  (GMP_NUMB_MASK & CNST_LIMB((0x1B649A07<<GMP_NAIL_BITS) | (0x6FC4CB25>>GMP_NUMB_BITS)))
127#define BINVERT_255x188513325H \
128  (GMP_NUMB_MASK & CNST_LIMB((0x06DB993A<<GMP_NAIL_BITS) | (0x6864275B>>GMP_NUMB_BITS)))
129#endif
130#else
131#if GMP_LIMB_BITS == 64
132#define BINVERT_2835  (GMP_NUMB_MASK &	CNST_LIMB(0x938CC70553E3771B))
133#define BINVERT_42525 (GMP_NUMB_MASK &	CNST_LIMB(0xE7B40D449F314C35))
134#define BINVERT_255x182712915  (GMP_NUMB_MASK &	CNST_LIMB(0x1B649A076FC4CB25))
135#define BINVERT_255x188513325  (GMP_NUMB_MASK &	CNST_LIMB(0x06DB993A6864275B))
136#endif
137#endif
138
139#ifndef mpn_divexact_by255
140#if GMP_NUMB_BITS % 8 == 0
141#define mpn_divexact_by255(dst,src,size) \
142  (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
143#else
144#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
145#define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
146#else
147#define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
148#endif
149#endif
150#endif
151
152#ifndef mpn_divexact_by255x4
153#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
154#define mpn_divexact_by255x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,2)
155#else
156#define mpn_divexact_by255x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)<<2)
157#endif
158#endif
159
160#ifndef mpn_divexact_by9x16
161#if HAVE_NATIVE_mpn_pi1_bdiv_q_1
162#define mpn_divexact_by9x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,4)
163#else
164#define mpn_divexact_by9x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<4)
165#endif
166#endif
167
168#ifndef mpn_divexact_by42525x16
169#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
170#define mpn_divexact_by42525x16(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,4)
171#else
172#define mpn_divexact_by42525x16(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)<<4)
173#endif
174#endif
175
176#ifndef mpn_divexact_by2835x64
177#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
178#define mpn_divexact_by2835x64(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,6)
179#else
180#define mpn_divexact_by2835x64(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<6)
181#endif
182#endif
183
184#ifndef  mpn_divexact_by255x182712915
185#if GMP_NUMB_BITS < 36
186#if HAVE_NATIVE_mpn_bdiv_q_2_pi2 && defined(BINVERT_255x182712915H)
187/* FIXME: use mpn_bdiv_q_2_pi2 */
188#endif
189#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_182712915)
190#define mpn_divexact_by255x182712915(dst,src,size)				\
191  do {										\
192    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(182712915),BINVERT_182712915,0);	\
193    mpn_divexact_by255(dst,dst,size);						\
194  } while(0)
195#else
196#define mpn_divexact_by255x182712915(dst,src,size)	\
197  do {							\
198    mpn_divexact_1(dst,src,size,CNST_LIMB(182712915));	\
199    mpn_divexact_by255(dst,dst,size);			\
200  } while(0)
201#endif
202#else /* GMP_NUMB_BITS > 35 */
203#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x182712915)
204#define mpn_divexact_by255x182712915(dst,src,size) \
205  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(182712915),BINVERT_255x182712915,0)
206#else
207#define mpn_divexact_by255x182712915(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(182712915))
208#endif
209#endif /* GMP_NUMB_BITS >?< 36 */
210#endif
211
212#ifndef  mpn_divexact_by255x188513325
213#if GMP_NUMB_BITS < 36
214#if HAVE_NATIVE_mpn_bdiv_q_1_pi2 && defined(BINVERT_255x188513325H)
215/* FIXME: use mpn_bdiv_q_1_pi2 */
216#endif
217#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_188513325)
218#define mpn_divexact_by255x188513325(dst,src,size)			\
219  do {									\
220    mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(188513325),BINVERT_188513325,0);	\
221    mpn_divexact_by255(dst,dst,size);					\
222  } while(0)
223#else
224#define mpn_divexact_by255x188513325(dst,src,size)	\
225  do {							\
226    mpn_divexact_1(dst,src,size,CNST_LIMB(188513325));	\
227    mpn_divexact_by255(dst,dst,size);			\
228  } while(0)
229#endif
230#else /* GMP_NUMB_BITS > 35 */
231#if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_255x188513325)
232#define mpn_divexact_by255x188513325(dst,src,size) \
233  mpn_pi1_bdiv_q_1(dst,src,size,255*CNST_LIMB(188513325),BINVERT_255x188513325,0)
234#else
235#define mpn_divexact_by255x188513325(dst,src,size) mpn_divexact_1(dst,src,size,255*CNST_LIMB(188513325))
236#endif
237#endif /* GMP_NUMB_BITS >?< 36 */
238#endif
239
240/* Interpolation for Toom-8.5 (or Toom-8), using the evaluation
241   points: infinity(8.5 only), +-8, +-4, +-2, +-1, +-1/4, +-1/2,
242   +-1/8, 0. More precisely, we want to compute
243   f(2^(GMP_NUMB_BITS * n)) for a polynomial f of degree 15 (or
244   14), given the 16 (rsp. 15) values:
245
246     r0 = limit at infinity of f(x) / x^7,
247     r1 = f(8),f(-8),
248     r2 = f(4),f(-4),
249     r3 = f(2),f(-2),
250     r4 = f(1),f(-1),
251     r5 = f(1/4),f(-1/4),
252     r6 = f(1/2),f(-1/2),
253     r7 = f(1/8),f(-1/8),
254     r8 = f(0).
255
256   All couples of the form f(n),f(-n) must be already mixed with
257   toom_couple_handling(f(n),...,f(-n),...)
258
259   The result is stored in {pp, spt + 7*n (or 8*n)}.
260   At entry, r8 is stored at {pp, 2n},
261   r6 is stored at {pp + 3n, 3n + 1}.
262   r4 is stored at {pp + 7n, 3n + 1}.
263   r2 is stored at {pp +11n, 3n + 1}.
264   r0 is stored at {pp +15n, spt}.
265
266   The other values are 3n+1 limbs each (with most significant limbs small).
267
268   Negative intermediate results are stored two-complemented.
269   Inputs are destroyed.
270*/
271
272void
273mpn_toom_interpolate_16pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, mp_ptr r7,
274			mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
275{
276  mp_limb_t cy;
277  mp_size_t n3;
278  mp_size_t n3p1;
279  n3 = 3 * n;
280  n3p1 = n3 + 1;
281
282#define   r6    (pp + n3)			/* 3n+1 */
283#define   r4    (pp + 7 * n)			/* 3n+1 */
284#define   r2    (pp +11 * n)			/* 3n+1 */
285#define   r0    (pp +15 * n)			/* s+t <= 2*n */
286
287  ASSERT( spt <= 2 * n );
288  /******************************* interpolation *****************************/
289  if( half != 0) {
290    cy = mpn_sub_n (r4, r4, r0, spt);
291    MPN_DECR_U (r4 + spt, n3p1 - spt, cy);
292
293    cy = DO_mpn_sublsh_n (r3, r0, spt, 14, wsi);
294    MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
295    DO_mpn_subrsh(r6, n3p1, r0, spt, 2, wsi);
296
297    cy = DO_mpn_sublsh_n (r2, r0, spt, 28, wsi);
298    MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
299    DO_mpn_subrsh(r5, n3p1, r0, spt, 4, wsi);
300
301    cy = DO_mpn_sublsh_n (r1 + BIT_CORRECTION, r0, spt, 42 - CORRECTION_BITS, wsi);
302    MPN_DECR_U (r1 + spt + BIT_CORRECTION, n3p1 - spt - BIT_CORRECTION, cy);
303#if BIT_CORRECTION
304    /* FIXME: assumes r7[n3p1] is writable (it is if r5 follows). */
305    cy = r7[n3p1];
306    r7[n3p1] = 0x80;
307#endif
308    DO_mpn_subrsh(r7, n3p1 + BIT_CORRECTION, r0, spt, 6, wsi);
309#if BIT_CORRECTION
310    /* FIXME: assumes r7[n3p1] is writable. */
311    ASSERT ( BIT_CORRECTION > 0 || r7[n3p1] == 0x80 );
312    r7[n3p1] = cy;
313#endif
314  };
315
316  r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 28, wsi);
317  DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
318
319#if HAVE_NATIVE_mpn_add_n_sub_n
320  mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
321#else
322  mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
323  ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
324  MP_PTR_SWAP(r5, wsi);
325#endif
326
327  r6[n3] -= DO_mpn_sublsh_n (r6 + n, pp, 2 * n, 14, wsi);
328  DO_mpn_subrsh(r3 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
329
330#if HAVE_NATIVE_mpn_add_n_sub_n
331  mpn_add_n_sub_n (r3, r6, r6, r3, n3p1);
332#else
333  ASSERT_NOCARRY(mpn_add_n (wsi, r3, r6, n3p1));
334  mpn_sub_n (r6, r6, r3, n3p1); /* can be negative */
335  MP_PTR_SWAP(r3, wsi);
336#endif
337
338  cy = DO_mpn_sublsh_n (r7 + n + BIT_CORRECTION, pp, 2 * n, 42 - CORRECTION_BITS, wsi);
339#if BIT_CORRECTION
340  MPN_DECR_U (r1 + n, 2 * n + 1, pp[0] >> 6);
341  cy = DO_mpn_sublsh_n (r1 + n, pp + 1, 2 * n - 1, GMP_NUMB_BITS - 6, wsi);
342  cy = mpn_sub_1(r1 + 3 * n - 1, r1 + 3 * n - 1, 2, cy);
343  ASSERT ( BIT_CORRECTION > 0 || cy != 0 );
344#else
345  r7[n3] -= cy;
346  DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 6, wsi);
347#endif
348
349#if HAVE_NATIVE_mpn_add_n_sub_n
350  mpn_add_n_sub_n (r1, r7, r7, r1, n3p1);
351#else
352  mpn_sub_n (wsi, r7, r1, n3p1); /* can be negative */
353  mpn_add_n (r1, r1, r7, n3p1);  /* if BIT_CORRECTION != 0, can give a carry. */
354  MP_PTR_SWAP(r7, wsi);
355#endif
356
357  r4[n3] -= mpn_sub_n (r4+n, r4+n, pp, 2 * n);
358
359#if AORSMUL_FASTER_2AORSLSH
360  mpn_submul_1 (r5, r6, n3p1, 1028); /* can be negative */
361#else
362  DO_mpn_sublsh_n (r5, r6, n3p1, 2, wsi); /* can be negative */
363  DO_mpn_sublsh_n (r5, r6, n3p1,10, wsi); /* can be negative */
364#endif
365
366  mpn_submul_1 (r7, r5, n3p1, 1300); /* can be negative */
367#if AORSMUL_FASTER_3AORSLSH
368  mpn_submul_1 (r7, r6, n3p1, 1052688); /* can be negative */
369#else
370  DO_mpn_sublsh_n (r7, r6, n3p1, 4, wsi); /* can be negative */
371  DO_mpn_sublsh_n (r7, r6, n3p1,12, wsi); /* can be negative */
372  DO_mpn_sublsh_n (r7, r6, n3p1,20, wsi); /* can be negative */
373#endif
374  mpn_divexact_by255x188513325(r7, r7, n3p1);
375
376  mpn_submul_1 (r5, r7, n3p1, 12567555); /* can be negative */
377  /* A division by 2835x64 followsi. Warning: the operand can be negative! */
378  mpn_divexact_by2835x64(r5, r5, n3p1);
379  if ((r5[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-7))) != 0)
380    r5[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-6));
381
382#if AORSMUL_FASTER_AORS_AORSLSH
383  mpn_submul_1 (r6, r7, n3p1, 4095); /* can be negative */
384#else
385  mpn_add_n (r6, r6, r7, n3p1); /* can give a carry */
386  DO_mpn_sublsh_n (r6, r7, n3p1, 12, wsi); /* can be negative */
387#endif
388#if AORSMUL_FASTER_2AORSLSH
389  mpn_addmul_1 (r6, r5, n3p1, 240); /* can be negative */
390#else
391  DO_mpn_addlsh_n (r6, r5, n3p1, 8, wsi); /* can give a carry */
392  DO_mpn_sublsh_n (r6, r5, n3p1, 4, wsi); /* can be negative */
393#endif
394  /* A division by 255x4 followsi. Warning: the operand can be negative! */
395  mpn_divexact_by255x4(r6, r6, n3p1);
396  if ((r6[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
397    r6[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
398
399  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r4, n3p1, 7, wsi));
400
401  ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r4, n3p1, 13, wsi));
402  ASSERT_NOCARRY(mpn_submul_1 (r2, r3, n3p1, 400));
403
404  /* If GMP_NUMB_BITS < 42 next operations on r1 can give a carry!*/
405  DO_mpn_sublsh_n (r1, r4, n3p1, 19, wsi);
406  mpn_submul_1 (r1, r2, n3p1, 1428);
407  mpn_submul_1 (r1, r3, n3p1, 112896);
408  mpn_divexact_by255x182712915(r1, r1, n3p1);
409
410  ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 15181425));
411  mpn_divexact_by42525x16(r2, r2, n3p1);
412
413#if AORSMUL_FASTER_AORS_2AORSLSH
414  ASSERT_NOCARRY(mpn_submul_1 (r3, r1, n3p1, 3969));
415#else
416  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
417  ASSERT_NOCARRY(DO_mpn_addlsh_n (r3, r1, n3p1, 7, wsi));
418  ASSERT_NOCARRY(DO_mpn_sublsh_n (r3, r1, n3p1, 12, wsi));
419#endif
420  ASSERT_NOCARRY(mpn_submul_1 (r3, r2, n3p1, 900));
421  mpn_divexact_by9x16(r3, r3, n3p1);
422
423  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r1, n3p1));
424  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r3, n3p1));
425  ASSERT_NOCARRY(mpn_sub_n (r4, r4, r2, n3p1));
426
427  mpn_add_n (r6, r2, r6, n3p1);
428  ASSERT_NOCARRY(mpn_rshift(r6, r6, n3p1, 1));
429  ASSERT_NOCARRY(mpn_sub_n (r2, r2, r6, n3p1));
430
431  mpn_sub_n (r5, r3, r5, n3p1);
432  ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
433  ASSERT_NOCARRY(mpn_sub_n (r3, r3, r5, n3p1));
434
435  mpn_add_n (r7, r1, r7, n3p1);
436  ASSERT_NOCARRY(mpn_rshift(r7, r7, n3p1, 1));
437  ASSERT_NOCARRY(mpn_sub_n (r1, r1, r7, n3p1));
438
439  /* last interpolation steps... */
440  /* ... could be mixed with recomposition
441	||H-r7|M-r7|L-r7|   ||H-r5|M-r5|L-r5|
442  */
443
444  /***************************** recomposition *******************************/
445  /*
446    pp[] prior to operations:
447    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
448
449    summation scheme for remaining operations:
450    |__16|n_15|n_14|n_13|n_12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
451    |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|___||H r6|M r6|L r6|____|H_r8|L r8|pp
452	||H r1|M r1|L r1|   ||H r3|M r3|L r3|   ||H_r5|M_r5|L_r5|   ||H r7|M r7|L r7|
453  */
454
455  cy = mpn_add_n (pp + n, pp + n, r7, n);
456  cy = mpn_add_1 (pp + 2 * n, r7 + n, n, cy);
457#if HAVE_NATIVE_mpn_add_nc
458  cy = r7[n3] + mpn_add_nc(pp + n3, pp + n3, r7 + 2 * n, n, cy);
459#else
460  MPN_INCR_U (r7 + 2 * n, n + 1, cy);
461  cy = r7[n3] + mpn_add_n (pp + n3, pp + n3, r7 + 2 * n, n);
462#endif
463  MPN_INCR_U (pp + 4 * n, 2 * n + 1, cy);
464
465  pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r5, n);
466  cy = mpn_add_1 (pp + 2 * n3, r5 + n, n, pp[2 * n3]);
467#if HAVE_NATIVE_mpn_add_nc
468  cy = r5[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r5 + 2 * n, n, cy);
469#else
470  MPN_INCR_U (r5 + 2 * n, n + 1, cy);
471  cy = r5[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r5 + 2 * n, n);
472#endif
473  MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
474
475  pp[10 * n]+= mpn_add_n (pp + 9 * n, pp + 9 * n, r3, n);
476  cy = mpn_add_1 (pp + 10 * n, r3 + n, n, pp[10 * n]);
477#if HAVE_NATIVE_mpn_add_nc
478  cy = r3[n3] + mpn_add_nc(pp +11 * n, pp +11 * n, r3 + 2 * n, n, cy);
479#else
480  MPN_INCR_U (r3 + 2 * n, n + 1, cy);
481  cy = r3[n3] + mpn_add_n (pp +11 * n, pp +11 * n, r3 + 2 * n, n);
482#endif
483  MPN_INCR_U (pp +12 * n, 2 * n + 1, cy);
484
485  pp[14 * n]+=mpn_add_n (pp +13 * n, pp +13 * n, r1, n);
486  if ( half ) {
487    cy = mpn_add_1 (pp + 14 * n, r1 + n, n, pp[14 * n]);
488#if HAVE_NATIVE_mpn_add_nc
489    if(LIKELY(spt > n)) {
490      cy = r1[n3] + mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, n, cy);
491      MPN_INCR_U (pp + 16 * n, spt - n, cy);
492    } else {
493      ASSERT_NOCARRY(mpn_add_nc(pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt, cy));
494    }
495#else
496    MPN_INCR_U (r1 + 2 * n, n + 1, cy);
497    if(LIKELY(spt > n)) {
498      cy = r1[n3] + mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, n);
499      MPN_INCR_U (pp + 16 * n, spt - n, cy);
500    } else {
501      ASSERT_NOCARRY(mpn_add_n (pp + 15 * n, pp + 15 * n, r1 + 2 * n, spt));
502    }
503#endif
504  } else {
505    ASSERT_NOCARRY(mpn_add_1 (pp + 14 * n, r1 + n, spt, pp[14 * n]));
506  }
507
508#undef   r0
509#undef   r2
510#undef   r4
511#undef   r6
512}
513