bignum_mul_4_8_alt.S revision 1.3
1// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2//
3// Permission to use, copy, modify, and/or distribute this software for any
4// purpose with or without fee is hereby granted, provided that the above
5// copyright notice and this permission notice appear in all copies.
6//
7// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10// ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12// ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13// OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
14
15// ----------------------------------------------------------------------------
16// Multiply z := x * y
17// Inputs x[4], y[4]; output z[8]
18//
19//    extern void bignum_mul_4_8_alt
20//      (uint64_t z[static 8], uint64_t x[static 4], uint64_t y[static 4]);
21//
22// Standard x86-64 ABI: RDI = z, RSI = x, RDX = y
23// Microsoft x64 ABI:   RCX = z, RDX = x, R8 = y
24// ----------------------------------------------------------------------------
25
26#include "s2n_bignum_internal.h"
27
28        .intel_syntax noprefix
29        S2N_BN_SYM_VISIBILITY_DIRECTIVE(bignum_mul_4_8_alt)
30        S2N_BN_SYM_PRIVACY_DIRECTIVE(bignum_mul_4_8_alt)
31        .text
32
// Pointer arguments that stay in the registers where the standard ABI
// delivers them (the Windows prologue copies them into these as well)

#define z       rdi
#define x       rsi

// The y pointer arrives in rdx, but every mul overwrites rdx with the
// high half of its product, so y is held in rcx instead

#define y       rcx

// Three accumulator registers; their roles as low word, high word and
// carry word rotate by one position from each result term to the next

#define t0      r8
#define t1      r9
#define t2      r10
47
// Three-word multiply-accumulate: (carry,high,low) += src1 * src2.
// Performs the two-word step of combads, then absorbs the carry out of
// the high word into the carry register. Clobbers rax, rdx and the flags;
// src2 is used as a 64-bit memory operand of mul.

#define combadd(carry,high,low,src1,src2)       \
        combads(high,low,src1,src2);            \
        adc     carry, 0
56
// Variant of combadd with a minutely shorter encoding, valid only when the
// carry register is already zero on entry: "adc carry, carry" then computes
// 0 + 0 + CF, which is exactly what "adc carry, 0" would give.

#define combadz(carry,high,low,src1,src2)       \
        combads(high,low,src1,src2);            \
        adc     carry, carry
65
// Two-word multiply-accumulate: (high,low) += src1 * src2, with rax and
// rdx clobbered by the product. A carry out of the high word is left in CF
// and not recorded anywhere, so use this only where no such carry can occur.

#define combads(high,low,src1,src2)             \
        mov     rax, src1;                      \
        mul     QWORD PTR src2;                 \
        add     low, rax;                       \
        adc     high, rdx
73
S2N_BN_SYMBOL(bignum_mul_4_8_alt):

// Computes the full 8-word product z = x * y of two 4-word inputs, one
// result word at a time: word k collects every x[i] * y[j] with i + j = k
// in a three-register window, the low register is stored to z[k], and the
// other two registers carry over into word k + 1.
//
// Clobbers rax, rcx, rdx, r8, r9, r10 and the flags. Each z[k] is stored
// while the inputs are still being read, so the result is only correct
// when z does not overlap x or y.

#if WINDOWS_ABI
// rdi and rsi are callee-saved in the Microsoft x64 ABI: preserve them,
// then move the arguments into the standard-ABI registers used below
        push    rdi
        push    rsi
        mov     rdi, rcx
        mov     rsi, rdx
        mov     rdx, r8
#endif

// Copy y into a safe register to start with, since mul overwrites rdx

        mov     y, rdx

// Result term 0: the low half of x[0] * y[0] is final; the high half
// seeds the window for term 1

        mov     rax, [x]
        mul     QWORD PTR [y]

        mov     [z], rax
        mov     t0, rdx
        xor     t1, t1

// Result term 1: window (c,h,l) = (t2,t1,t0). t1 is zero before the first
// product, so adding a high half plus carry cannot overflow it and the
// short form suffices there

        xor     t2, t2
        combads(t1,t0,[x],[y+8])
        combadz(t2,t1,t0,[x+8],[y])
        mov     [z+8], t0

// Result term 2: window (c,h,l) = (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x],[y+16])
        combadd(t0,t2,t1,[x+8],[y+8])
        combadd(t0,t2,t1,[x+16],[y])
        mov     [z+16], t1

// Result term 3: window (c,h,l) = (t1,t0,t2)

        xor     t1, t1
        combadz(t1,t0,t2,[x],[y+24])
        combadd(t1,t0,t2,[x+8],[y+16])
        combadd(t1,t0,t2,[x+16],[y+8])
        combadd(t1,t0,t2,[x+24],[y])
        mov     [z+24], t2

// Result term 4: window (c,h,l) = (t2,t1,t0)

        xor     t2, t2
        combadz(t2,t1,t0,[x+8],[y+24])
        combadd(t2,t1,t0,[x+16],[y+16])
        combadd(t2,t1,t0,[x+24],[y+8])
        mov     [z+32], t0

// Result term 5: window (c,h,l) = (t0,t2,t1)

        xor     t0, t0
        combadz(t0,t2,t1,[x+16],[y+24])
        combadd(t0,t2,t1,[x+24],[y+16])
        mov     [z+40], t1

// Result term 6: only x[3] * y[3] remains, accumulated into (t0,t2).
// The whole product fits in 8 words, so there is no carry out of t0 and
// no third window register is needed

        combads(t0,t2,[x+24],[y+24])
        mov     [z+48], t2

// Result term 7: whatever is left in the high word

        mov     [z+56], t0

// Return, restoring the callee-saved registers in reverse push order

#if WINDOWS_ABI
        pop    rsi
        pop    rdi
#endif
        ret
153
154#if defined(__linux__) && defined(__ELF__)
155.section .note.GNU-stack,"",%progbits
156#endif
157