1/* mpn_sqr_basecase -- Internal routine to square a natural number
2   of length n.
3
4   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
5   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
6
7
8Copyright 1991-1994, 1996, 1997, 2000-2005, 2008, 2010, 2011, 2017 Free
9Software Foundation, Inc.
10
11This file is part of the GNU MP Library.
12
13The GNU MP Library is free software; you can redistribute it and/or modify
14it under the terms of either:
15
16  * the GNU Lesser General Public License as published by the Free
17    Software Foundation; either version 3 of the License, or (at your
18    option) any later version.
19
20or
21
22  * the GNU General Public License as published by the Free Software
23    Foundation; either version 2 of the License, or (at your option) any
24    later version.
25
26or both in parallel, as here.
27
28The GNU MP Library is distributed in the hope that it will be useful, but
29WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
30or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
31for more details.
32
33You should have received copies of the GNU General Public License and the
34GNU Lesser General Public License along with the GNU MP Library.  If not,
35see https://www.gnu.org/licenses/.  */
36
37#include "gmp-impl.h"
38#include "longlong.h"
39
40
#if HAVE_NATIVE_mpn_sqr_diagonal
#define MPN_SQR_DIAGONAL(rp, up, n)					\
  mpn_sqr_diagonal (rp, up, n)
#else
/* Generic fallback: write the two-limb square of each limb of {up,n}
   to rp[2*_i] (low) and rp[2*_i+1] (high).  The shifts by
   GMP_NAIL_BITS pre-shift the multiplier and post-shift the low
   product so the numb parts line up when nails are enabled; both are
   no-ops when GMP_NAIL_BITS == 0.  */
#define MPN_SQR_DIAGONAL(rp, up, n)					\
  do {									\
    mp_size_t _i;							\
    for (_i = 0; _i < (n); _i++)					\
      {									\
	mp_limb_t ul, lpl;						\
	ul = (up)[_i];							\
	umul_ppmm ((rp)[2 * _i + 1], lpl, ul, ul << GMP_NAIL_BITS);	\
	(rp)[2 * _i] = lpl >> GMP_NAIL_BITS;				\
      }									\
  } while (0)
#endif
57
#if HAVE_NATIVE_mpn_sqr_diag_addlsh1
#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
  mpn_sqr_diag_addlsh1 (rp, tp, up, n)
#else
/* Fallbacks: set {rp,2n} to the diagonal squares of {up,n}, then add
   2 * {tp,2n-2} shifted up one limb (tp holds the off-diagonal
   products, which appear twice in a square).  The carry out of the
   addition is folded into the top limb rp[2n-1].  */
#if HAVE_NATIVE_mpn_addlsh1_n
#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
  do {									\
    mp_limb_t cy;							\
    MPN_SQR_DIAGONAL (rp, up, n);					\
    cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
    rp[2 * n - 1] += cy;						\
  } while (0)
#else
/* No native addlsh1_n: shift tp left one bit in place, then add.  */
#define MPN_SQR_DIAG_ADDLSH1(rp, tp, up, n)				\
  do {									\
    mp_limb_t cy;							\
    MPN_SQR_DIAGONAL (rp, up, n);					\
    cy = mpn_lshift (tp, tp, 2 * n - 2, 1);				\
    cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);			\
    rp[2 * n - 1] += cy;						\
  } while (0)
#endif
#endif
81
82
#undef READY_WITH_mpn_sqr_basecase


#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2s

/* mpn_sqr_basecase variant built on a native mpn_addmul_2s: compute
   {rp,2n} = {up,n}^2.  The off-diagonal products are accumulated into
   the scratch area tp two multiplier limbs at a time, then
   MPN_SQR_DIAG_ADDLSH1 folds in 2*tp plus the diagonal squares.
   NOTE(review): mpn_addmul_2s is an asm primitive; presumably the "s"
   form handles the would-be u[i]*u[i] term specially so no correction
   pass is needed (contrast the plain-addmul_2 variant below) — confirm
   against the asm implementations.  */
void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;
  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
  mp_ptr tp = tarr;
  mp_limb_t cy;

  /* must fit 2*n limbs in tarr */
  ASSERT (n <= SQR_TOOM2_THRESHOLD);

  if ((n & 1) != 0)
    {
      if (n == 1)
	{
	  /* Single limb: nail-aware square, as in MPN_SQR_DIAGONAL.  */
	  mp_limb_t ul, lpl;
	  ul = up[0];
	  umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
	  rp[0] = lpl >> GMP_NAIL_BITS;
	  return;
	}

      MPN_ZERO (tp, n);

      /* Odd n: the two-limb strides cover all rows exactly; each pass
	 accumulates up[i+1..n-1] times the limb pair at up[i].  */
      for (i = 0; i <= n - 2; i += 2)
	{
	  cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
	  tp[n + i] = cy;
	}
    }
  else
    {
      if (n == 2)
	{
	  /* 2x2 product of up with itself, written directly to rp.  */
#if HAVE_NATIVE_mpn_mul_2
	  rp[3] = mpn_mul_2 (rp, up, 2, up);
#else
	  rp[0] = 0;
	  rp[1] = 0;
	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
#endif
	  return;
	}

      MPN_ZERO (tp, n);

      /* Even n: stop the pairwise loop one row early; the last row is
	 the single cross product u[n-1]*u[n-2], added below.  */
      for (i = 0; i <= n - 4; i += 2)
	{
	  cy = mpn_addmul_2s (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
	  tp[n + i] = cy;
	}
      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
      tp[2 * n - 3] = cy;
    }

  /* {rp,2n} = 2 * {tp,2n-2} (offset one limb) + diagonal squares.  */
  MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
}
#define READY_WITH_mpn_sqr_basecase
#endif
146
147
#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_addmul_2

/* mpn_sqr_basecase using plain mpn_addmul_2.

   This is tricky, since we have to let mpn_addmul_2 make some undesirable
   multiplies, u[k]*u[k], that we would like to let mpn_sqr_diagonal handle.
   This forces us to conditionally add or subtract the mpn_sqr_diagonal
   results.  Examples of the product we form:

   n = 4              n = 5		n = 6
   u1u0 * u3u2u1      u1u0 * u4u3u2u1	u1u0 * u5u4u3u2u1
   u2 * u3	      u3u2 * u4u3	u3u2 * u5u4u3
					u4 * u5
   add: u0 u2 u3      add: u0 u2 u4	add: u0 u2 u4 u5
   sub: u1	      sub: u1 u3	sub: u1 u3
*/

void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  mp_size_t i;
  mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
  mp_ptr tp = tarr;
  mp_limb_t cy;

  /* must fit 2*n limbs in tarr */
  ASSERT (n <= SQR_TOOM2_THRESHOLD);

  if ((n & 1) != 0)
    {
      mp_limb_t x0, x1;

      if (n == 1)
	{
	  /* Single limb: nail-aware square, as in MPN_SQR_DIAGONAL.  */
	  mp_limb_t ul, lpl;
	  ul = up[0];
	  umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
	  rp[0] = lpl >> GMP_NAIL_BITS;
	  return;
	}

      /* The code below doesn't like unnormalized operands.  Since such
	 operands are unusual, handle them with a dumb recursion.  */
      if (up[n - 1] == 0)
	{
	  rp[2 * n - 2] = 0;
	  rp[2 * n - 1] = 0;
	  mpn_sqr_basecase (rp, up, n - 1);
	  return;
	}

      MPN_ZERO (tp, n);

      /* Off-diagonal products, two multiplier limbs per pass; per the
	 table above this also adds unwanted u[k]*u[k] terms.  */
      for (i = 0; i <= n - 2; i += 2)
	{
	  cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
	  tp[n + i] = cy;
	}

      MPN_SQR_DIAGONAL (rp, up, n);

      /* Correction pass per the add/sub table above: the squares of the
	 odd-index limbs (the limb pairs at rp[2], rp[6], rp[10], ...)
	 must be subtracted rather than added, so negate each such pair
	 and propagate the borrow/carry into the following limbs.  */
      for (i = 2;; i += 4)
	{
	  x0 = rp[i + 0];
	  rp[i + 0] = (-x0) & GMP_NUMB_MASK;
	  x1 = rp[i + 1];
	  rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
	  __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
	  if (i + 4 >= 2 * n)
	    break;
	  mpn_incr_u (rp + i + 4, cy);
	}
    }
  else
    {
      mp_limb_t x0, x1;

      if (n == 2)
	{
	  /* 2x2 product of up with itself, written directly to rp.  */
#if HAVE_NATIVE_mpn_mul_2
	  rp[3] = mpn_mul_2 (rp, up, 2, up);
#else
	  rp[0] = 0;
	  rp[1] = 0;
	  rp[3] = mpn_addmul_2 (rp, up, 2, up);
#endif
	  return;
	}

      /* The code below doesn't like unnormalized operands.  Since such
	 operands are unusual, handle them with a dumb recursion.  */
      if (up[n - 1] == 0)
	{
	  rp[2 * n - 2] = 0;
	  rp[2 * n - 1] = 0;
	  mpn_sqr_basecase (rp, up, n - 1);
	  return;
	}

      MPN_ZERO (tp, n);

      /* Even n: pairwise loop stops one row early; the final row is the
	 single cross product u[n-1]*u[n-2], added just below.  */
      for (i = 0; i <= n - 4; i += 2)
	{
	  cy = mpn_addmul_2 (tp + 2 * i, up + i + 1, n - (i + 1), up + i);
	  tp[n + i] = cy;
	}
      cy = mpn_addmul_1 (tp + 2 * n - 4, up + n - 1, 1, up[n - 2]);
      tp[2 * n - 3] = cy;

      MPN_SQR_DIAGONAL (rp, up, n);

      /* Same correction pass as the odd case, but the last pair's borrow
	 is applied with mpn_decr_u after the loop instead.  */
      for (i = 2;; i += 4)
	{
	  x0 = rp[i + 0];
	  rp[i + 0] = (-x0) & GMP_NUMB_MASK;
	  x1 = rp[i + 1];
	  rp[i + 1] = (-x1 - (x0 != 0)) & GMP_NUMB_MASK;
	  if (i + 6 >= 2 * n)
	    break;
	  __GMPN_SUB_1 (cy, rp + i + 2, rp + i + 2, 2, (x1 | x0) != 0);
	  mpn_incr_u (rp + i + 4, cy);
	}
      mpn_decr_u (rp + i + 2, (x1 | x0) != 0);
    }

  /* rp += 2 * tp, offset one limb; fold the carry into the top limb.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy = mpn_addlsh1_n (rp + 1, rp + 1, tp, 2 * n - 2);
#else
  cy = mpn_lshift (tp, tp, 2 * n - 2, 1);
  cy += mpn_add_n (rp + 1, rp + 1, tp, 2 * n - 2);
#endif
  rp[2 * n - 1] += cy;
}
#define READY_WITH_mpn_sqr_basecase
#endif
283
284
#if ! defined (READY_WITH_mpn_sqr_basecase) && HAVE_NATIVE_mpn_sqr_diag_addlsh1

/* mpn_sqr_basecase using mpn_addmul_1 and mpn_sqr_diag_addlsh1, avoiding stack
   allocation.  The off-diagonal triangle is accumulated directly in
   rp (offset one limb), advancing rp by two limbs and up by one limb
   per row; the native sqr_diag_addlsh1 then combines the doubled sum
   with the diagonal squares in place.  */
void
mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
{
  if (n == 1)
    {
      /* Single limb: nail-aware square, as in MPN_SQR_DIAGONAL.  */
      mp_limb_t ul, lpl;
      ul = up[0];
      umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
      rp[0] = lpl >> GMP_NAIL_BITS;
    }
  else
    {
      mp_size_t i;
      mp_ptr xp;

      /* First row: rp[1..n-1] = u[1..n-1] * u[0], top limb in rp[n].  */
      rp += 1;
      rp[n - 1] = mpn_mul_1 (rp, up + 1, n - 1, up[0]);
      /* Remaining rows, each shifted two limbs up and one limb shorter.  */
      for (i = n - 2; i != 0; i--)
	{
	  up += 1;
	  rp += 2;
	  rp[i] = mpn_addmul_1 (rp, up + 1, i, up[0]);
	}

      /* The loop advanced rp by 2n-3 limbs in total and up by n-2, so
	 xp is the caller's rp and up - n + 2 is the caller's up.  */
      xp = rp - 2 * n + 3;
      mpn_sqr_diag_addlsh1 (xp, xp + 1, up - n + 2, n);
    }
}
#define READY_WITH_mpn_sqr_basecase
#endif
319
320
321#if ! defined (READY_WITH_mpn_sqr_basecase)
322
323/* Default mpn_sqr_basecase using mpn_addmul_1.  */
324void
325mpn_sqr_basecase (mp_ptr rp, mp_srcptr up, mp_size_t n)
326{
327  mp_size_t i;
328
329  ASSERT (n >= 1);
330  ASSERT (! MPN_OVERLAP_P (rp, 2*n, up, n));
331
332  if (n == 1)
333    {
334      mp_limb_t ul, lpl;
335      ul = up[0];
336      umul_ppmm (rp[1], lpl, ul, ul << GMP_NAIL_BITS);
337      rp[0] = lpl >> GMP_NAIL_BITS;
338    }
339  else
340    {
341      mp_limb_t tarr[2 * SQR_TOOM2_THRESHOLD];
342      mp_ptr tp = tarr;
343      mp_limb_t cy;
344
345      /* must fit 2*n limbs in tarr */
346      ASSERT (n <= SQR_TOOM2_THRESHOLD);
347
348      cy = mpn_mul_1 (tp, up + 1, n - 1, up[0]);
349      tp[n - 1] = cy;
350      for (i = 2; i < n; i++)
351	{
352	  mp_limb_t cy;
353	  cy = mpn_addmul_1 (tp + 2 * i - 2, up + i, n - i, up[i - 1]);
354	  tp[n + i - 2] = cy;
355	}
356
357      MPN_SQR_DIAG_ADDLSH1 (rp, tp, up, n);
358    }
359}
360#define READY_WITH_mpn_sqr_basecase
361#endif
362