1/* mpn_add_n_sub_n -- Add and Subtract two limb vectors of equal, non-zero length. 2 3 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 4 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7Copyright 1999-2001, 2006 Free Software Foundation, Inc. 8 9This file is part of the GNU MP Library. 10 11The GNU MP Library is free software; you can redistribute it and/or modify 12it under the terms of either: 13 14 * the GNU Lesser General Public License as published by the Free 15 Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18or 19 20 * the GNU General Public License as published by the Free Software 21 Foundation; either version 2 of the License, or (at your option) any 22 later version. 23 24or both in parallel, as here. 25 26The GNU MP Library is distributed in the hope that it will be useful, but 27WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 28or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 29for more details. 30 31You should have received copies of the GNU General Public License and the 32GNU Lesser General Public License along with the GNU MP Library. If not, 33see https://www.gnu.org/licenses/. */ 34 35#include "gmp-impl.h" 36 37#ifndef L1_CACHE_SIZE 38#define L1_CACHE_SIZE 8192 /* only 68040 has less than this */ 39#endif 40 41#define PART_SIZE (L1_CACHE_SIZE / GMP_LIMB_BYTES / 6) 42 43 44/* mpn_add_n_sub_n. 45 r1[] = s1[] + s2[] 46 r2[] = s1[] - s2[] 47 All operands have n limbs. 48 In-place operations allowed. */ 49mp_limb_t 50mpn_add_n_sub_n (mp_ptr r1p, mp_ptr r2p, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n) 51{ 52 mp_limb_t acyn, acyo; /* carry for add */ 53 mp_limb_t scyn, scyo; /* carry for subtract */ 54 mp_size_t off; /* offset in operands */ 55 mp_size_t this_n; /* size of current chunk */ 56 57 /* We alternatingly add and subtract in chunks that fit into the (L1) 58 cache. Since the chunks are several hundred limbs, the function call 59 overhead is insignificant, but we get much better locality. */ 60 61 /* We have three variant of the inner loop, the proper loop is chosen 62 depending on whether r1 or r2 are the same operand as s1 or s2. */ 63 64 if (r1p != s1p && r1p != s2p) 65 { 66 /* r1 is not identical to either input operand. We can therefore write 67 to r1 directly, without using temporary storage. */ 68 acyo = 0; 69 scyo = 0; 70 for (off = 0; off < n; off += PART_SIZE) 71 { 72 this_n = MIN (n - off, PART_SIZE); 73#if HAVE_NATIVE_mpn_add_nc 74 acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); 75#else 76 acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); 77 acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); 78#endif 79#if HAVE_NATIVE_mpn_sub_nc 80 scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); 81#else 82 scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); 83 scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); 84#endif 85 } 86 } 87 else if (r2p != s1p && r2p != s2p) 88 { 89 /* r2 is not identical to either input operand. We can therefore write 90 to r2 directly, without using temporary storage. */ 91 acyo = 0; 92 scyo = 0; 93 for (off = 0; off < n; off += PART_SIZE) 94 { 95 this_n = MIN (n - off, PART_SIZE); 96#if HAVE_NATIVE_mpn_sub_nc 97 scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); 98#else 99 scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); 100 scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); 101#endif 102#if HAVE_NATIVE_mpn_add_nc 103 acyo = mpn_add_nc (r1p + off, s1p + off, s2p + off, this_n, acyo); 104#else 105 acyn = mpn_add_n (r1p + off, s1p + off, s2p + off, this_n); 106 acyo = acyn + mpn_add_1 (r1p + off, r1p + off, this_n, acyo); 107#endif 108 } 109 } 110 else 111 { 112 /* r1 and r2 are identical to s1 and s2 (r1==s1 and r2==s2 or vice versa) 113 Need temporary storage. */ 114 mp_limb_t tp[PART_SIZE]; 115 acyo = 0; 116 scyo = 0; 117 for (off = 0; off < n; off += PART_SIZE) 118 { 119 this_n = MIN (n - off, PART_SIZE); 120#if HAVE_NATIVE_mpn_add_nc 121 acyo = mpn_add_nc (tp, s1p + off, s2p + off, this_n, acyo); 122#else 123 acyn = mpn_add_n (tp, s1p + off, s2p + off, this_n); 124 acyo = acyn + mpn_add_1 (tp, tp, this_n, acyo); 125#endif 126#if HAVE_NATIVE_mpn_sub_nc 127 scyo = mpn_sub_nc (r2p + off, s1p + off, s2p + off, this_n, scyo); 128#else 129 scyn = mpn_sub_n (r2p + off, s1p + off, s2p + off, this_n); 130 scyo = scyn + mpn_sub_1 (r2p + off, r2p + off, this_n, scyo); 131#endif 132 MPN_COPY (r1p + off, tp, this_n); 133 } 134 } 135 136 return 2 * acyo + scyo; 137} 138 139#ifdef MAIN 140#include <stdlib.h> 141#include <stdio.h> 142#include "timing.h" 143 144long cputime (); 145 146int 147main (int argc, char **argv) 148{ 149 mp_ptr r1p, r2p, s1p, s2p; 150 double t; 151 mp_size_t n; 152 153 n = strtol (argv[1], 0, 0); 154 155 r1p = malloc (n * GMP_LIMB_BYTES); 156 r2p = malloc (n * GMP_LIMB_BYTES); 157 s1p = malloc (n * GMP_LIMB_BYTES); 158 s2p = malloc (n * GMP_LIMB_BYTES); 159 TIME (t,(mpn_add_n(r1p,s1p,s2p,n),mpn_sub_n(r1p,s1p,s2p,n))); 160 printf (" separate add and sub: %.3f\n", t); 161 TIME (t,mpn_add_n_sub_n(r1p,r2p,s1p,s2p,n)); 162 printf ("combined addsub separate variables: %.3f\n", t); 163 TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); 164 printf (" combined addsub r1 overlap: %.3f\n", t); 165 TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,s2p,n)); 166 printf (" combined addsub r2 overlap: %.3f\n", t); 167 TIME (t,mpn_add_n_sub_n(r1p,r2p,r1p,r2p,n)); 168 printf (" combined addsub in-place: %.3f\n", t); 169 170 return 0; 171} 172#endif 173