1/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length.
2
3   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
4   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
5   GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
6
7Copyright 1999-2001, 2006 Free Software Foundation, Inc.
8
9This file is part of the GNU MP Library.
10
11The GNU MP Library is free software; you can redistribute it and/or modify
12it under the terms of either:
13
14  * the GNU Lesser General Public License as published by the Free
15    Software Foundation; either version 3 of the License, or (at your
16    option) any later version.
17
18or
19
20  * the GNU General Public License as published by the Free Software
21    Foundation; either version 2 of the License, or (at your option) any
22    later version.
23
24or both in parallel, as here.
25
26The GNU MP Library is distributed in the hope that it will be useful, but
27WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
28or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
29for more details.
30
31You should have received copies of the GNU General Public License and the
32GNU Lesser General Public License along with the GNU MP Library.  If not,
33see https://www.gnu.org/licenses/.  */
34
35#include "gmp-impl.h"
36
37#ifndef L1_CACHE_SIZE
38#define L1_CACHE_SIZE 8192	/* only 68040 has less than this */
39#endif
40
41#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6)
42
43
44/* mpn_add_n_sub_n.
45   r1[] = s1[] + s2[]
46   r2[] = s1[] - s2[]
47   All operands have n limbs.
48   In-place operations allowed.  */
49mp_limb_t
50mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
51{
52  mp_limb_t acyn, acyo;		/* carry for add */
53  mp_limb_t scyn, scyo;		/* carry for subtract */
54  mp_size_t off;		/* offset in operands */
55  mp_size_t this_n;		/* size of current chunk */
56
57  /* We alternatingly add and subtract in chunks that fit into the (L1)
58     cache.  Since the chunks are several hundred limbs, the function call
59     overhead is insignificant, but we get much better locality.  */
60
61  /* We have three variant of the inner loop, the proper loop is chosen
62     depending on whether r1 or r2 are the same operand as s1 or s2.  */
63
64  if (r1p != s1p && r1p != s2p)
65    {
66      /* r1 is not identical to either input operand.  We can therefore write
67	 to r1 directly, without using temporary storage.  */
68      acyo = 0;
69      scyo = 0;
70      for (off = 0; off < n; off += PART_SIZE)
71	{
72	  this_n = MIN (n - off, PART_SIZE);
73#if HAVE_NATIVE_mpn_add_nc
74	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
75#else
76	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
77	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
78#endif
79#if HAVE_NATIVE_mpn_sub_nc
80	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
81#else
82	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
83	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
84#endif
85	}
86    }
87  else if (r2p != s1p && r2p != s2p)
88    {
89      /* r2 is not identical to either input operand.  We can therefore write
90	 to r2 directly, without using temporary storage.  */
91      acyo = 0;
92      scyo = 0;
93      for (off = 0; off < n; off += PART_SIZE)
94	{
95	  this_n = MIN (n - off, PART_SIZE);
96#if HAVE_NATIVE_mpn_sub_nc
97	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
98#else
99	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
100	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
101#endif
102#if HAVE_NATIVE_mpn_add_nc
103	  acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo);
104#else
105	  acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n);
106	  acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo);
107#endif
108	}
109    }
110  else
111    {
112      /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa)
113	 Need temporary storage.  */
114      mp_limb_t tp[PART_SIZE];
115      acyo = 0;
116      scyo = 0;
117      for (off = 0; off < n; off += PART_SIZE)
118	{
119	  this_n = MIN (n - off, PART_SIZE);
120#if HAVE_NATIVE_mpn_add_nc
121	  acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo);
122#else
123	  acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n);
124	  acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo);
125#endif
126#if HAVE_NATIVE_mpn_sub_nc
127	  scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo);
128#else
129	  scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n);
130	  scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo);
131#endif
132	  MPN_COPY (r1p + off, tp, this_n);
133	}
134    }
135
136  return 2 * acyo + scyo;
137}
138
139#ifdef MAIN
140#include <stdlib.h>
141#include <stdio.h>
142#include "timing.h"
143
144long cputime ();
145
146int
147main (int argc, char **argv)
148{
149  mp_ptr r1p, r2p, s1p, s2p;
150  double t;
151  mp_size_t n;
152
153  n = strtol (argv[1], 0, 0);
154
155  r1p = malloc (n * GMP_LIMB_BYTES);
156  r2p = malloc (n * GMP_LIMB_BYTES);
157  s1p = malloc (n * GMP_LIMB_BYTES);
158  s2p = malloc (n * GMP_LIMB_BYTES);
159  TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n)));
160  printf ("              separate add and sub: %.3f\n", t);
161  TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n));
162  printf ("combined addsub separate variables: %.3f\n", t);
163  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
164  printf ("        combined addsub r1 overlap: %.3f\n", t);
165  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n));
166  printf ("        combined addsub r2 overlap: %.3f\n", t);
167  TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n));
168  printf ("          combined addsub in-place: %.3f\n", t);
169
170  return 0;
171}
172#endif
173