/*
 * Argon2 source code package
 *
 * Written by Daniel Dinu and Dmitry Khovratovich, 2015
 *
 * This work is licensed under a Creative Commons CC0 1.0 License/Waiver.
 *
 * You should have received a copy of the CC0 Public Domain Dedication along
 * with this software. If not, see
 * <http://creativecommons.org/publicdomain/zero/1.0/>.
 */

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "argon2-core.h"
#include "argon2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"

#if defined(HAVE_AVX2INTRIN_H) && defined(HAVE_EMMINTRIN_H) && \
    defined(HAVE_TMMINTRIN_H) && defined(HAVE_SMMINTRIN_H)

# ifdef __GNUC__
#  pragma GCC target("sse2")
#  pragma GCC target("ssse3")
#  pragma GCC target("sse4.1")
#  pragma GCC target("avx2")
# endif

# ifdef _MSC_VER
#  include <intrin.h> /* for _mm_set_epi64x */
# endif
# include <emmintrin.h>
# include <immintrin.h>
# include <smmintrin.h>
# include <tmmintrin.h>

# include "blamka-round-avx2.h"

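/*
 * Compression function G. A 1024-byte block is handled as
 * ARGON2_HWORDS_IN_BLOCK (32) 256-bit words. On entry, `state` holds the
 * previous block; this computes next_block = P(prev ^ ref) ^ (prev ^ ref),
 * where P is the BlaMka permutation applied first to the block's rows and
 * then to its columns.
 */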
static void
fill_block(__m256i *state, const uint8_t *ref_block, uint8_t *next_block)
{
    __m256i  block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

    /* state = prev ^ ref; keep a copy in block_XY for the final feedforward */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        block_XY[i] = state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
    }

    /* Apply the BlaMka round to the eight rows, two rows per call */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

    /* Apply the BlaMka round to the eight columns, two columns per call */
    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

    /* Feedforward XOR, then store the new block */
    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

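/*
 * Variant of fill_block() that additionally XORs the old contents of
 * next_block into the feedforward value: next_block ^= P(prev ^ ref) ^
 * (prev ^ ref). Used on second and later passes, when blocks are
 * overwritten rather than first written, and for address generation.
 */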
static void
fill_block_with_xor(__m256i *state, const uint8_t *ref_block,
                    uint8_t *next_block)
{
    __m256i  block_XY[ARGON2_HWORDS_IN_BLOCK];
    uint32_t i;

    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&ref_block[32 * i])));
        block_XY[i] = _mm256_xor_si256(
            state[i], _mm256_loadu_si256((__m256i const *) (&next_block[32 * i])));
    }

    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_1(state[8 * i + 0], state[8 * i + 4], state[8 * i + 1], state[8 * i + 5],
                       state[8 * i + 2], state[8 * i + 6], state[8 * i + 3], state[8 * i + 7]);
    }

    for (i = 0; i < 4; ++i) {
        BLAKE2_ROUND_2(state[ 0 + i], state[ 4 + i], state[ 8 + i], state[12 + i],
                       state[16 + i], state[20 + i], state[24 + i], state[28 + i]);
    }

    for (i = 0; i < ARGON2_HWORDS_IN_BLOCK; i++) {
        state[i] = _mm256_xor_si256(state[i], block_XY[i]);
        _mm256_storeu_si256((__m256i *) (&next_block[32 * i]), state[i]);
    }
}

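/*
 * Data-independent addressing (Argon2i, and the first half of the first
 * pass of Argon2id): reference indices are derived from the position
 * alone. An input block holding (pass, lane, slice, memory size, passes,
 * type, block counter) is run through G twice, yielding a block of
 * ARGON2_ADDRESSES_IN_BLOCK pseudo-random 64-bit values.
 */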
static void
generate_addresses(const argon2_instance_t *instance,
                   const argon2_position_t *position, uint64_t *pseudo_rands)
{
    block    address_block, input_block, tmp_block;
    uint32_t i;

    init_block_value(&address_block, 0);
    init_block_value(&input_block, 0);

    if (instance != NULL && position != NULL) {
        input_block.v[0] = position->pass;
        input_block.v[1] = position->lane;
        input_block.v[2] = position->slice;
        input_block.v[3] = instance->memory_blocks;
        input_block.v[4] = instance->passes;
        input_block.v[5] = instance->type;

        for (i = 0; i < instance->segment_length; ++i) {
            if (i % ARGON2_ADDRESSES_IN_BLOCK == 0) {
                /* Temporary zero-initialized blocks */
                __m256i zero_block[ARGON2_HWORDS_IN_BLOCK];
                __m256i zero2_block[ARGON2_HWORDS_IN_BLOCK];

                memset(zero_block, 0, sizeof(zero_block));
                memset(zero2_block, 0, sizeof(zero2_block));
                init_block_value(&address_block, 0);
                init_block_value(&tmp_block, 0);
                /* Increment the block counter */
                input_block.v[6]++;
                /* First iteration of G */
                fill_block_with_xor(zero_block, (uint8_t *) &input_block.v,
                                    (uint8_t *) &tmp_block.v);
                /* Second iteration of G */
                fill_block_with_xor(zero2_block, (uint8_t *) &tmp_block.v,
                                    (uint8_t *) &address_block.v);
            }

            pseudo_rands[i] = address_block.v[i % ARGON2_ADDRESSES_IN_BLOCK];
        }
    }
}

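/*
 * Fills one segment: the portion of one lane that belongs to the current
 * slice. Each new block is G(previous block, reference block); the
 * previous block is carried across iterations in `state`.
 */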
void
fill_segment_avx2(const argon2_instance_t *instance,
                  argon2_position_t        position)
{
    block    *ref_block = NULL, *curr_block = NULL;
    uint64_t  pseudo_rand, ref_index, ref_lane;
    uint32_t  prev_offset, curr_offset;
    uint32_t  starting_index, i;
    __m256i   state[ARGON2_HWORDS_IN_BLOCK];
    int       data_independent_addressing = 1;

    /* Pseudo-random values that determine the reference block position */
    uint64_t *pseudo_rands = NULL;

    if (instance == NULL) {
        return;
    }

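    /* Argon2id uses data-independent addressing for the first half of the
     * first pass, and data-dependent addressing from then on */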
    if (instance->type == Argon2_id &&
        (position.pass != 0 || position.slice >= ARGON2_SYNC_POINTS / 2)) {
        data_independent_addressing = 0;
    }

    pseudo_rands = instance->pseudo_rands;

    if (data_independent_addressing) {
        generate_addresses(instance, &position, pseudo_rands);
    }

    starting_index = 0;

    if ((0 == position.pass) && (0 == position.slice)) {
        starting_index = 2; /* we have already generated the first two blocks */
    }

    /* Offset of the current block */
    curr_offset = position.lane * instance->lane_length +
                  position.slice * instance->segment_length + starting_index;

    if (0 == curr_offset % instance->lane_length) {
        /* First block of the lane: the previous block wraps around to the
         * last block of the same lane */
        prev_offset = curr_offset + instance->lane_length - 1;
    } else {
        /* Previous block */
        prev_offset = curr_offset - 1;
    }

    /* Load the previous block into `state` */
    memcpy(state, ((instance->region->memory + prev_offset)->v),
           ARGON2_BLOCK_SIZE);

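    /* Main loop: one block per iteration; `state` carries the block that
     * was just computed, so the previous block is never reloaded */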
    for (i = starting_index; i < instance->segment_length;
         ++i, ++curr_offset, ++prev_offset) {
        /* 1.1 Rotating prev_offset if needed */
        if (curr_offset % instance->lane_length == 1) {
            prev_offset = curr_offset - 1;
        }

        /* 1.2 Computing the index of the reference block */
        /* 1.2.1 Taking pseudo-random value from the previous block */
        if (data_independent_addressing) {
#pragma warning(push)
#pragma warning(disable : 6385) /* MSVC code analysis: spurious buffer-read warning */
            pseudo_rand = pseudo_rands[i];
#pragma warning(pop)
        } else {
            pseudo_rand = instance->region->memory[prev_offset].v[0];
        }

        /* 1.2.2 Computing the lane of the reference block */
        ref_lane = ((pseudo_rand >> 32)) % instance->lanes;

        if ((position.pass == 0) && (position.slice == 0)) {
            /* Cannot reference other lanes yet */
            ref_lane = position.lane;
        }

        /* 1.2.3 Computing the reference block index within the lane: the
         * low 32 bits of pseudo_rand select among the reachable blocks
         */
        position.index = i;
        ref_index = index_alpha(instance, &position, pseudo_rand & 0xFFFFFFFF,
                                ref_lane == position.lane);

        /* 2 Creating a new block */
        ref_block = instance->region->memory +
                    instance->lane_length * ref_lane + ref_index;
        curr_block = instance->region->memory + curr_offset;
        if (position.pass != 0) {
            /* Later passes overwrite: XOR the old block into the result */
            fill_block_with_xor(state, (uint8_t *) ref_block->v,
                                (uint8_t *) curr_block->v);
        } else {
            fill_block(state, (uint8_t *) ref_block->v,
                       (uint8_t *) curr_block->v);
        }
    }
}
#endif