1// SPDX-License-Identifier: GPL-2.0-only
2
3/* PIPAPO: PIle PAcket POlicies: AVX2 packet lookup routines
4 *
5 * Copyright (c) 2019-2020 Red Hat GmbH
6 *
7 * Author: Stefano Brivio <sbrivio@redhat.com>
8 */
9
10#include <linux/kernel.h>
11#include <linux/init.h>
12#include <linux/module.h>
13#include <linux/netlink.h>
14#include <linux/netfilter.h>
15#include <linux/netfilter/nf_tables.h>
16#include <net/netfilter/nf_tables_core.h>
17#include <uapi/linux/netfilter/nf_tables.h>
18#include <linux/bitmap.h>
19#include <linux/bitops.h>
20
21#include <linux/compiler.h>
22#include <asm/fpu/api.h>
23
24#include "nft_set_pipapo_avx2.h"
25#include "nft_set_pipapo.h"
26
/* Number of longs making up one chunk of a bitmap or bucket as processed by a
 * single YMM load or store in the lookup routines below: those routines advance
 * their table pointer and map index by this many longs per iteration.
 *
 * NOTE(review): this relies on the numeric value of XSAVE_YMM_SIZE, which is
 * defined outside this file, matching the number of bits in a YMM register --
 * confirm against its definition.
 */
#define NFT_PIPAPO_LONGS_PER_M256	(XSAVE_YMM_SIZE / BITS_PER_LONG)
28
/* Load from memory into YMM register with non-temporal hint ("stream load"),
 * that is, don't fetch lines from memory into the cache. This avoids pushing
 * precious packet data out of the cache hierarchy, and is appropriate when:
 *
 * - loading buckets from lookup tables, as they are not going to be used
 *   again before packets are entirely classified
 *
 * - loading the result bitmap from the previous field, as it's never used
 *   again
 *
 * @reg:	YMM register number: it is stringified into the instruction
 *		text, so it must be a literal number, not a variable
 * @loc:	Memory location (lvalue) to load from, passed as "m" input
 *
 * The destination register is not declared as output or clobber, so the
 * compiler is not aware that it gets overwritten.
 *
 * NOTE(review): the instruction presumably needs @loc to be aligned to the
 * register size -- confirm alignment of lookup tables and maps with callers.
 */
#define NFT_PIPAPO_AVX2_LOAD(reg, loc)					\
	asm volatile("vmovntdqa %0, %%ymm" #reg : : "m" (loc))
41
/* Stream a single lookup table bucket into YMM register given lookup table,
 * group index, value of packet bits, bucket size.
 *
 * @reg:	YMM register number, passed on to NFT_PIPAPO_AVX2_LOAD()
 * @lt:		Lookup table, indexed in longs: any pointer expression is fine,
 *		as the argument is parenthesised before being subscripted
 * @group:	Index of the group within the field
 * @v:		Value of packet bits for this group, selecting the bucket
 * @bsize:	Bucket size, in the same unit used to index @lt
 *
 * The _LOAD4 and _LOAD8 variants only differ in the number of buckets each
 * group has, NFT_PIPAPO_BUCKETS(4) and NFT_PIPAPO_BUCKETS(8) respectively.
 */
#define NFT_PIPAPO_AVX2_BUCKET_LOAD4(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_LOAD(reg,					\
			     (lt)[((group) * NFT_PIPAPO_BUCKETS(4) +	\
				   (v)) * (bsize)])
#define NFT_PIPAPO_AVX2_BUCKET_LOAD8(reg, lt, group, v, bsize)		\
	NFT_PIPAPO_AVX2_LOAD(reg,					\
			     (lt)[((group) * NFT_PIPAPO_BUCKETS(8) +	\
				   (v)) * (bsize)])
53
/* Bitwise AND: the staple operation of this algorithm
 *
 * @dst:	Number of the YMM register receiving the result
 * @a:		Number of the first source YMM register
 * @b:		Number of the second source YMM register
 *
 * Register numbers are stringified into the instruction text, so they must be
 * literal numbers. No operands or clobbers are declared: the compiler is not
 * aware of the registers being read or written here.
 */
#define NFT_PIPAPO_AVX2_AND(dst, a, b)					\
	asm volatile("vpand %ymm" #a ", %ymm" #b ", %ymm" #dst)
57
/* Jump to label if @reg is zero
 *
 * @reg:	Number of the YMM register to test against itself
 * @label:	C label in the enclosing function, declared as the only
 *		possible jump target of this asm goto statement
 *
 * If the jump is not taken, execution continues with the next statement.
 */
#define NFT_PIPAPO_AVX2_NOMATCH_GOTO(reg, label)			\
	asm goto("vptest %%ymm" #reg ", %%ymm" #reg ";"	\
			  "je %l[" #label "]" : : : : label)
62
/* Store 256 bits from YMM register into memory. Contrary to bucket load
 * operation, we don't bypass the cache here, as stored matching results
 * are always used shortly after.
 *
 * @loc:	Memory location (lvalue) to store to, declared as "=m" output so
 *		that the compiler knows it is written
 * @reg:	Number of the source YMM register, stringified into the
 *		instruction text
 *
 * NOTE(review): the aligned form of the move is used, so @loc presumably has
 * to be aligned to the register size -- confirm with callers.
 */
#define NFT_PIPAPO_AVX2_STORE(loc, reg)					\
	asm volatile("vmovdqa %%ymm" #reg ", %0" : "=m" (loc))
69
/* Zero out a complete YMM register, @reg, by XORing it with itself
 *
 * @reg:	Number of the YMM register to clear, stringified into the
 *		instruction text
 */
#define NFT_PIPAPO_AVX2_ZERO(reg)					\
	asm volatile("vpxor %ymm" #reg ", %ymm" #reg ", %ymm" #reg)
73
/**
 * nft_pipapo_avx2_prepare() - Prepare before main algorithm body
 *
 * This zeroes out ymm15, which is later used whenever we need to clear a
 * memory location, by storing its content into memory: the 'nomatch' paths of
 * the lookup routines in this file store ymm15 into the current map chunk.
 *
 * The register is cleared by a plain asm statement without operands or
 * clobbers, so nothing but the ordering of calls keeps it zeroed.
 *
 * NOTE(review): presumably this has to run in the same FPU usage section as the
 * lookup routines that follow -- confirm with the caller.
 */
static void nft_pipapo_avx2_prepare(void)
{
	NFT_PIPAPO_AVX2_ZERO(15);
}
84
85/**
86 * nft_pipapo_avx2_fill() - Fill a bitmap region with ones
87 * @data:	Base memory area
88 * @start:	First bit to set
89 * @len:	Count of bits to fill
90 *
91 * This is nothing else than a version of bitmap_set(), as used e.g. by
92 * pipapo_refill(), tailored for the microarchitectures using it and better
93 * suited for the specific usage: it's very likely that we'll set a small number
94 * of bits, not crossing a word boundary, and correct branch prediction is
95 * critical here.
96 *
97 * This function doesn't actually use any AVX2 instruction.
98 */
99static void nft_pipapo_avx2_fill(unsigned long *data, int start, int len)
100{
101	int offset = start % BITS_PER_LONG;
102	unsigned long mask;
103
104	data += start / BITS_PER_LONG;
105
106	if (likely(len == 1)) {
107		*data |= BIT(offset);
108		return;
109	}
110
111	if (likely(len < BITS_PER_LONG || offset)) {
112		if (likely(len + offset <= BITS_PER_LONG)) {
113			*data |= GENMASK(len - 1 + offset, offset);
114			return;
115		}
116
117		*data |= ~0UL << offset;
118		len -= BITS_PER_LONG - offset;
119		data++;
120
121		if (len <= BITS_PER_LONG) {
122			mask = ~0UL >> (BITS_PER_LONG - len);
123			*data |= mask;
124			return;
125		}
126	}
127
128	memset(data, 0xff, len / BITS_PER_BYTE);
129	data += len / BITS_PER_LONG;
130
131	len %= BITS_PER_LONG;
132	if (len)
133		*data |= ~0UL >> (BITS_PER_LONG - len);
134}
135
136/**
137 * nft_pipapo_avx2_refill() - Scan bitmap, select mapping table item, set bits
138 * @offset:	Start from given bitmap (equivalent to bucket) offset, in longs
139 * @map:	Bitmap to be scanned for set bits
140 * @dst:	Destination bitmap
141 * @mt:		Mapping table containing bit set specifiers
142 * @last:	Return index of first set bit, if this is the last field
143 *
144 * This is an alternative implementation of pipapo_refill() suitable for usage
145 * with AVX2 lookup routines: we know there are four words to be scanned, at
146 * a given offset inside the map, for each matching iteration.
147 *
148 * This function doesn't actually use any AVX2 instruction.
149 *
150 * Return: first set bit index if @last, index of first filled word otherwise.
151 */
152static int nft_pipapo_avx2_refill(int offset, unsigned long *map,
153				  unsigned long *dst,
154				  union nft_pipapo_map_bucket *mt, bool last)
155{
156	int ret = -1;
157
158#define NFT_PIPAPO_AVX2_REFILL_ONE_WORD(x)				\
159	do {								\
160		while (map[(x)]) {					\
161			int r = __builtin_ctzl(map[(x)]);		\
162			int i = (offset + (x)) * BITS_PER_LONG + r;	\
163									\
164			if (last)					\
165				return i;				\
166									\
167			nft_pipapo_avx2_fill(dst, mt[i].to, mt[i].n);	\
168									\
169			if (ret == -1)					\
170				ret = mt[i].to;				\
171									\
172			map[(x)] &= ~(1UL << r);			\
173		}							\
174	} while (0)
175
176	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(0);
177	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(1);
178	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(2);
179	NFT_PIPAPO_AVX2_REFILL_ONE_WORD(3);
180#undef NFT_PIPAPO_AVX2_REFILL_ONE_WORD
181
182	return ret;
183}
184
/**
 * nft_pipapo_avx2_lookup_4b_2() - AVX2-based lookup for 2 four-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * Load buckets from lookup table corresponding to the values of each 4-bit
 * group of packet bytes, and perform a bitwise intersection between them. If
 * this is the first field in the set, simply AND the buckets together
 * (equivalent to using an all-ones starting bitmap), use the provided starting
 * bitmap otherwise. Then call nft_pipapo_avx2_refill() to generate the next
 * working bitmap, @fill.
 *
 * This is used for 8-bit fields (e.g. protocol numbers).
 *
 * Out-of-order (and superscalar) execution is vital here, so it's critical to
 * avoid false data dependencies. CPU and compiler could (mostly) take care of
 * this on their own, but the operation ordering is explicitly given here with
 * a likely execution order in mind, to highlight possible stalls. That's why
 * a number of logically distinct operations (i.e. loading buckets, intersecting
 * buckets) are interleaved.
 *
 * @offset and the loop counter are expressed in chunks of
 * NFT_PIPAPO_LONGS_PER_M256 longs. @map is used as scratch area: each visited
 * chunk is overwritten with the intersection (and then consumed by
 * nft_pipapo_avx2_refill()) or with the content of ymm15 if nothing matches.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_4b_2(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	/* One group per nibble: high nibble first, then low nibble */
	u8 pg[2] = { pkt[0] >> 4, pkt[0] & 0xf };
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
			NFT_PIPAPO_AVX2_LOAD(2, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
			/* Previous result (ymm2) is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nothing);
			NFT_PIPAPO_AVX2_AND(3, 0, 1);
			NFT_PIPAPO_AVX2_AND(4, 2, 3);
		}

		/* ymm4: intersection of all the buckets for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
259
/**
 * nft_pipapo_avx2_lookup_4b_4() - AVX2-based lookup for 4 four-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 16-bit fields (e.g. ports).
 *
 * Two bytes of @pkt are read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_4b_4(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	/* One group per nibble, high nibble of each byte first */
	u8 pg[4] = { pkt[0] >> 4, pkt[0] & 0xf, pkt[1] >> 4, pkt[1] & 0xf };
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1, lt, 1, pg[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 2, pg[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 3, pg[3], bsize);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
			NFT_PIPAPO_AVX2_AND(5, 2, 3);
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0, lt, 0, pg[0], bsize);

			/* ymm1: previous result for this chunk */
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);

			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2, lt, 1, pg[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3, lt, 2, pg[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4, lt, 3, pg[3], bsize);
			NFT_PIPAPO_AVX2_AND(5, 0, 1);

			/* Previous result is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);

			NFT_PIPAPO_AVX2_AND(6, 2, 3);
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
			/* Stall */
			NFT_PIPAPO_AVX2_AND(7, 6, 7);
		}

		/* Stall */
		/* ymm7: intersection of all the buckets for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(7, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 7);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
336
/**
 * nft_pipapo_avx2_lookup_4b_8() - AVX2-based lookup for 8 four-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 32-bit fields (e.g. IPv4 addresses).
 *
 * Four bytes of @pkt are read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_4b_8(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	/* One group per nibble, high nibble of each byte first */
	u8 pg[8] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
		      pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
		   };
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* Loads and partial ANDs are interleaved: the result
			 * of all eight buckets ends up in ymm1
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 1, pg[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 2, pg[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 3, pg[3], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 4, pg[4], bsize);
			NFT_PIPAPO_AVX2_AND(5,   0,  1);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 5, pg[5], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 6, pg[6], bsize);
			NFT_PIPAPO_AVX2_AND(8,   2,  3);
			NFT_PIPAPO_AVX2_AND(9,   4,  5);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 7, pg[7], bsize);
			NFT_PIPAPO_AVX2_AND(11,  6,  7);
			NFT_PIPAPO_AVX2_AND(12,  8,  9);
			NFT_PIPAPO_AVX2_AND(13, 10, 11);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(1,  12, 13);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 0, pg[0], bsize);
			/* ymm1: previous result for this chunk */
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 1, pg[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 2, pg[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt, 3, pg[3], bsize);

			/* Previous result is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);

			NFT_PIPAPO_AVX2_AND(5,   0,  1);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 4, pg[4], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 5, pg[5], bsize);
			NFT_PIPAPO_AVX2_AND(8,   2,  3);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt, 6, pg[6], bsize);
			NFT_PIPAPO_AVX2_AND(10,  4,  5);
			NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt, 7, pg[7], bsize);
			NFT_PIPAPO_AVX2_AND(12,  6,  7);
			NFT_PIPAPO_AVX2_AND(13,  8,  9);
			NFT_PIPAPO_AVX2_AND(14, 10, 11);

			/* Stall */
			/* ymm1 is reused for the final result from here */
			NFT_PIPAPO_AVX2_AND(1,  12, 13);
			NFT_PIPAPO_AVX2_AND(1,   1, 14);
		}

		/* ymm1: intersection of all the buckets for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 1);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
432
/**
 * nft_pipapo_avx2_lookup_4b_12() - AVX2-based lookup for 12 four-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 48-bit fields (e.g. MAC addresses/EUI-48).
 *
 * Six bytes of @pkt are read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_4b_12(unsigned long *map, unsigned long *fill,
					const struct nft_pipapo_field *f,
					int offset, const u8 *pkt,
					bool first, bool last)
{
	/* One group per nibble, high nibble of each byte first */
	u8 pg[12] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
		       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
		       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
		    };
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* ymm0: previous result for this chunk, if any */
		if (!first)
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);

		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);

		if (!first) {
			/* Previous result is empty here: skip chunk.
			 * Otherwise, fold it into the first bucket, so that
			 * the rest of the sequence is common to both cases.
			 */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(1, 1, 0);
		}

		NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt,  4,  pg[4], bsize);
		NFT_PIPAPO_AVX2_AND(6,   2,  3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt,  6,  pg[6], bsize);
		NFT_PIPAPO_AVX2_AND(9,   1,  4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt,  7,  pg[7], bsize);
		NFT_PIPAPO_AVX2_AND(11,  5,  6);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt,  8,  pg[8], bsize);
		NFT_PIPAPO_AVX2_AND(13,  7,  8);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt,  9,  pg[9], bsize);

		/* From here on, registers whose content was already consumed
		 * are reused for the remaining buckets and partial results
		 */
		NFT_PIPAPO_AVX2_AND(0,   9, 10);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 10,  pg[10], bsize);
		NFT_PIPAPO_AVX2_AND(2,  11, 12);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11,  pg[11], bsize);
		NFT_PIPAPO_AVX2_AND(4,  13, 14);
		NFT_PIPAPO_AVX2_AND(5,   0,  1);

		NFT_PIPAPO_AVX2_AND(6,   2,  3);

		/* Stalls */
		NFT_PIPAPO_AVX2_AND(7,   4,  5);
		NFT_PIPAPO_AVX2_AND(8,   6,  7);

		/* ymm8: intersection of all twelve buckets (and of the
		 * previous result, unless @first) for this chunk
		 */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(8, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 8);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
522
/**
 * nft_pipapo_avx2_lookup_4b_32() - AVX2-based lookup for 32 four-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 128-bit fields (e.g. IPv6 addresses).
 *
 * Sixteen bytes of @pkt are read. Visited chunks of @map are overwritten.
 *
 * With 32 buckets to intersect and ymm15 left alone, registers ymm0 to ymm14
 * are reused several times over in the sequence below: a register is loaded
 * again only after its previous content was used as source of an AND.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_4b_32(unsigned long *map, unsigned long *fill,
					const struct nft_pipapo_field *f,
					int offset, const u8 *pkt,
					bool first, bool last)
{
	/* One group per nibble, high nibble of each byte first */
	u8 pg[32] = {  pkt[0] >> 4,  pkt[0] & 0xf,  pkt[1] >> 4,  pkt[1] & 0xf,
		       pkt[2] >> 4,  pkt[2] & 0xf,  pkt[3] >> 4,  pkt[3] & 0xf,
		       pkt[4] >> 4,  pkt[4] & 0xf,  pkt[5] >> 4,  pkt[5] & 0xf,
		       pkt[6] >> 4,  pkt[6] & 0xf,  pkt[7] >> 4,  pkt[7] & 0xf,
		       pkt[8] >> 4,  pkt[8] & 0xf,  pkt[9] >> 4,  pkt[9] & 0xf,
		      pkt[10] >> 4, pkt[10] & 0xf, pkt[11] >> 4, pkt[11] & 0xf,
		      pkt[12] >> 4, pkt[12] & 0xf, pkt[13] >> 4, pkt[13] & 0xf,
		      pkt[14] >> 4, pkt[14] & 0xf, pkt[15] >> 4, pkt[15] & 0xf,
		    };
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		/* ymm0: previous result for this chunk, if any */
		if (!first)
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);

		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt,  0,  pg[0], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt,  1,  pg[1], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt,  2,  pg[2], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(4,  lt,  3,  pg[3], bsize);
		if (!first) {
			/* Previous result is empty here: skip chunk.
			 * Otherwise, fold it into the first bucket, so that
			 * the rest of the sequence is common to both cases.
			 */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(1, 1, 0);
		}

		NFT_PIPAPO_AVX2_AND(5,   2,  3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt,  4,  pg[4], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt,  5,  pg[5], bsize);
		NFT_PIPAPO_AVX2_AND(8,   1,  4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(9,  lt,  6,  pg[6], bsize);
		NFT_PIPAPO_AVX2_AND(10,  5,  6);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(11, lt,  7,  pg[7], bsize);
		NFT_PIPAPO_AVX2_AND(12,  7,  8);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(13, lt,  8,  pg[8], bsize);
		NFT_PIPAPO_AVX2_AND(14,  9, 10);

		/* First round of register reuse, starting again from ymm0 */
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt,  9,  pg[9], bsize);
		NFT_PIPAPO_AVX2_AND(1,  11, 12);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(2,  lt, 10, pg[10], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 11, pg[11], bsize);
		NFT_PIPAPO_AVX2_AND(4,  13, 14);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 12, pg[12], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(6,  lt, 13, pg[13], bsize);
		NFT_PIPAPO_AVX2_AND(7,   0,  1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 14, pg[14], bsize);
		NFT_PIPAPO_AVX2_AND(9,   2,  3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 15, pg[15], bsize);
		NFT_PIPAPO_AVX2_AND(11,  4,  5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 16, pg[16], bsize);
		NFT_PIPAPO_AVX2_AND(13,  6,  7);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 17, pg[17], bsize);

		/* Second round of register reuse */
		NFT_PIPAPO_AVX2_AND(0,   8,  9);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(1,  lt, 18, pg[18], bsize);
		NFT_PIPAPO_AVX2_AND(2,  10, 11);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 19, pg[19], bsize);
		NFT_PIPAPO_AVX2_AND(4,  12, 13);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 20, pg[20], bsize);
		NFT_PIPAPO_AVX2_AND(6,  14,  0);
		NFT_PIPAPO_AVX2_AND(7,   1,  2);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 21, pg[21], bsize);
		NFT_PIPAPO_AVX2_AND(9,   3,  4);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 22, pg[22], bsize);
		NFT_PIPAPO_AVX2_AND(11,  5,  6);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 23, pg[23], bsize);
		NFT_PIPAPO_AVX2_AND(13,  7,  8);

		/* Third round of register reuse, loading the last buckets */
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(14, lt, 24, pg[24], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(0,  lt, 25, pg[25], bsize);
		NFT_PIPAPO_AVX2_AND(1,   9, 10);
		NFT_PIPAPO_AVX2_AND(2,  11, 12);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(3,  lt, 26, pg[26], bsize);
		NFT_PIPAPO_AVX2_AND(4,  13, 14);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(5,  lt, 27, pg[27], bsize);
		NFT_PIPAPO_AVX2_AND(6,   0,  1);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(7,  lt, 28, pg[28], bsize);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(8,  lt, 29, pg[29], bsize);
		NFT_PIPAPO_AVX2_AND(9,   2,  3);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(10, lt, 30, pg[30], bsize);
		NFT_PIPAPO_AVX2_AND(11,  4,  5);
		NFT_PIPAPO_AVX2_BUCKET_LOAD4(12, lt, 31, pg[31], bsize);

		/* All buckets loaded: reduce the partial results left */
		NFT_PIPAPO_AVX2_AND(0,   6,  7);
		NFT_PIPAPO_AVX2_AND(1,   8,  9);
		NFT_PIPAPO_AVX2_AND(2,  10, 11);
		NFT_PIPAPO_AVX2_AND(3,  12,  0);

		/* Stalls */
		NFT_PIPAPO_AVX2_AND(4,   1,  2);
		NFT_PIPAPO_AVX2_AND(5,   3,  4);

		/* ymm5: intersection of all 32 buckets (and of the previous
		 * result, unless @first) for this chunk
		 */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(5, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 5);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
658
/**
 * nft_pipapo_avx2_lookup_8b_1() - AVX2-based lookup for one eight-bit group
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 8-bit fields (e.g. protocol numbers).
 *
 * With eight-bit groups, the packet byte selects the bucket directly: one byte
 * of @pkt is read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_8b_1(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			/* Single bucket and no previous result: the bucket
			 * itself is the result
			 */
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 0, pkt[0], bsize);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			/* ymm1: previous result for this chunk */
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_AND(2, 0, 1);
			/* Previous result is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
		}

		/* ymm2: result for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(2, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 2);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
716
/**
 * nft_pipapo_avx2_lookup_8b_2() - AVX2-based lookup for 2 eight-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 16-bit fields (e.g. ports).
 *
 * With eight-bit groups, each packet byte selects a bucket directly: two bytes
 * of @pkt are read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_8b_2(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
		} else {
			/* ymm0: previous result for this chunk */
			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt, 1, pkt[1], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(3, 0, 1);
			/* Previous result is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
			NFT_PIPAPO_AVX2_AND(4, 3, 2);
		}

		/* Stall */
		/* ymm4: intersection of all the buckets for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;
nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
781
/**
 * nft_pipapo_avx2_lookup_8b_4() - AVX2-based lookup for 4 eight-bit groups
 * @map:	Previous match result, used as initial bitmap
 * @fill:	Destination bitmap to be filled with current match result
 * @f:		Field, containing lookup and mapping tables
 * @offset:	Ignore buckets before the given index, no bits are filled there
 * @pkt:	Packet data, pointer to input nftables register
 * @first:	If this is the first field, don't source previous result
 * @last:	Last field: stop at the first match and return bit index
 *
 * See nft_pipapo_avx2_lookup_4b_2().
 *
 * This is used for 32-bit fields (e.g. IPv4 addresses).
 *
 * With eight-bit groups, each packet byte selects a bucket directly: four
 * bytes of @pkt are read. Visited chunks of @map are overwritten.
 *
 * Return: -1 on no match, rule index of match if @last, otherwise the value
 * returned by nft_pipapo_avx2_refill() for the first matching chunk, divided
 * by XSAVE_YMM_SIZE, to be used as starting point for the next field.
 */
static int nft_pipapo_avx2_lookup_8b_4(unsigned long *map, unsigned long *fill,
				       const struct nft_pipapo_field *f,
				       int offset, const u8 *pkt,
				       bool first, bool last)
{
	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
	unsigned long *lt = f->lt, bsize = f->bsize;

	/* Skip chunks before @offset: lt then advances together with i */
	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
		/* Index of the first long of this chunk in @map */
		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;

		if (first) {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(4, 0, 1);
			NFT_PIPAPO_AVX2_AND(5, 2, 3);
			/* ymm0 is reused for the final result */
			NFT_PIPAPO_AVX2_AND(0, 4, 5);
		} else {
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
			/* ymm1: previous result for this chunk */
			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);

			NFT_PIPAPO_AVX2_AND(5, 0, 1);
			/* Previous result is empty here: skip chunk */
			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
			NFT_PIPAPO_AVX2_AND(6, 2, 3);

			/* Stall */
			NFT_PIPAPO_AVX2_AND(7, 4, 5);
			/* ymm0 is reused for the final result */
			NFT_PIPAPO_AVX2_AND(0, 6, 7);
		}

		/* ymm0: intersection of all the buckets for this chunk */
		NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nomatch);
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 0);

		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
		if (last)
			return b;

		/* Only the first matching chunk sets the return value */
		if (unlikely(ret == -1))
			ret = b / XSAVE_YMM_SIZE;

		continue;

nomatch:
		/* Overwrite this chunk of @map with ymm15, expected to be all
		 * zeroes, see nft_pipapo_avx2_prepare()
		 */
		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
nothing:
		;
	}

	return ret;
}
857
858/**
859 * nft_pipapo_avx2_lookup_8b_6() - AVX2-based lookup for 6 eight-bit groups
860 * @map:	Previous match result, used as initial bitmap
861 * @fill:	Destination bitmap to be filled with current match result
862 * @f:		Field, containing lookup and mapping tables
863 * @offset:	Ignore buckets before the given index, no bits are filled there
864 * @pkt:	Packet data, pointer to input nftables register
865 * @first:	If this is the first field, don't source previous result
866 * @last:	Last field: stop at the first match and return bit index
867 *
868 * See nft_pipapo_avx2_lookup_4b_2().
869 *
870 * This is used for 48-bit fields (i.e. MAC addresses/EUI-48).
871 *
872 * Return: -1 on no match, rule index of match if @last, otherwise first long
873 * word index to be checked next (i.e. first filled word).
874 */
875static int nft_pipapo_avx2_lookup_8b_6(unsigned long *map, unsigned long *fill,
876				       const struct nft_pipapo_field *f,
877				       int offset, const u8 *pkt,
878				       bool first, bool last)
879{
880	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
881	unsigned long *lt = f->lt, bsize = f->bsize;
882
883	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
884	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
885		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
886
887		if (first) {
888			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
889			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 1, pkt[1], bsize);
890			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 2, pkt[2], bsize);
891			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 3, pkt[3], bsize);
892			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 4, pkt[4], bsize);
893
894			NFT_PIPAPO_AVX2_AND(5, 0, 1);
895			NFT_PIPAPO_AVX2_BUCKET_LOAD8(6,  lt, 5, pkt[5], bsize);
896			NFT_PIPAPO_AVX2_AND(7, 2, 3);
897
898			/* Stall */
899			NFT_PIPAPO_AVX2_AND(0, 4, 5);
900			NFT_PIPAPO_AVX2_AND(1, 6, 7);
901			NFT_PIPAPO_AVX2_AND(4, 0, 1);
902		} else {
903			NFT_PIPAPO_AVX2_BUCKET_LOAD8(0,  lt, 0, pkt[0], bsize);
904			NFT_PIPAPO_AVX2_LOAD(1, map[i_ul]);
905			NFT_PIPAPO_AVX2_BUCKET_LOAD8(2,  lt, 1, pkt[1], bsize);
906			NFT_PIPAPO_AVX2_BUCKET_LOAD8(3,  lt, 2, pkt[2], bsize);
907			NFT_PIPAPO_AVX2_BUCKET_LOAD8(4,  lt, 3, pkt[3], bsize);
908
909			NFT_PIPAPO_AVX2_AND(5, 0, 1);
910			NFT_PIPAPO_AVX2_NOMATCH_GOTO(1, nothing);
911
912			NFT_PIPAPO_AVX2_AND(6, 2, 3);
913			NFT_PIPAPO_AVX2_BUCKET_LOAD8(7,  lt, 4, pkt[4], bsize);
914			NFT_PIPAPO_AVX2_AND(0, 4, 5);
915			NFT_PIPAPO_AVX2_BUCKET_LOAD8(1,  lt, 5, pkt[5], bsize);
916			NFT_PIPAPO_AVX2_AND(2, 6, 7);
917
918			/* Stall */
919			NFT_PIPAPO_AVX2_AND(3, 0, 1);
920			NFT_PIPAPO_AVX2_AND(4, 2, 3);
921		}
922
923		NFT_PIPAPO_AVX2_NOMATCH_GOTO(4, nomatch);
924		NFT_PIPAPO_AVX2_STORE(map[i_ul], 4);
925
926		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
927		if (last)
928			return b;
929
930		if (unlikely(ret == -1))
931			ret = b / XSAVE_YMM_SIZE;
932
933		continue;
934
935nomatch:
936		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
937nothing:
938		;
939	}
940
941	return ret;
942}
943
944/**
945 * nft_pipapo_avx2_lookup_8b_16() - AVX2-based lookup for 16 eight-bit groups
946 * @map:	Previous match result, used as initial bitmap
947 * @fill:	Destination bitmap to be filled with current match result
948 * @f:		Field, containing lookup and mapping tables
949 * @offset:	Ignore buckets before the given index, no bits are filled there
950 * @pkt:	Packet data, pointer to input nftables register
951 * @first:	If this is the first field, don't source previous result
952 * @last:	Last field: stop at the first match and return bit index
953 *
954 * See nft_pipapo_avx2_lookup_4b_2().
955 *
956 * This is used for 128-bit fields (i.e. IPv6 addresses).
957 *
958 * Return: -1 on no match, rule index of match if @last, otherwise first long
959 * word index to be checked next (i.e. first filled word).
960 */
961static int nft_pipapo_avx2_lookup_8b_16(unsigned long *map, unsigned long *fill,
962					const struct nft_pipapo_field *f,
963					int offset, const u8 *pkt,
964					bool first, bool last)
965{
966	int i, ret = -1, m256_size = f->bsize / NFT_PIPAPO_LONGS_PER_M256, b;
967	unsigned long *lt = f->lt, bsize = f->bsize;
968
969	lt += offset * NFT_PIPAPO_LONGS_PER_M256;
970	for (i = offset; i < m256_size; i++, lt += NFT_PIPAPO_LONGS_PER_M256) {
971		int i_ul = i * NFT_PIPAPO_LONGS_PER_M256;
972
973		if (!first)
974			NFT_PIPAPO_AVX2_LOAD(0, map[i_ul]);
975
976		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  0,  pkt[0], bsize);
977		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  1,  pkt[1], bsize);
978		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt,  2,  pkt[2], bsize);
979		if (!first) {
980			NFT_PIPAPO_AVX2_NOMATCH_GOTO(0, nothing);
981			NFT_PIPAPO_AVX2_AND(1, 1, 0);
982		}
983		NFT_PIPAPO_AVX2_BUCKET_LOAD8(4, lt,  3,  pkt[3], bsize);
984
985		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  4,  pkt[4], bsize);
986		NFT_PIPAPO_AVX2_AND(6, 1, 2);
987		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  5,  pkt[5], bsize);
988		NFT_PIPAPO_AVX2_AND(0, 3, 4);
989		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt,  6,  pkt[6], bsize);
990
991		NFT_PIPAPO_AVX2_BUCKET_LOAD8(2, lt,  7,  pkt[7], bsize);
992		NFT_PIPAPO_AVX2_AND(3, 5, 6);
993		NFT_PIPAPO_AVX2_AND(4, 0, 1);
994		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt,  8,  pkt[8], bsize);
995
996		NFT_PIPAPO_AVX2_AND(6, 2, 3);
997		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt,  9,  pkt[9], bsize);
998		NFT_PIPAPO_AVX2_AND(0, 4, 5);
999		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 10, pkt[10], bsize);
1000		NFT_PIPAPO_AVX2_AND(2, 6, 7);
1001		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 11, pkt[11], bsize);
1002		NFT_PIPAPO_AVX2_AND(4, 0, 1);
1003		NFT_PIPAPO_AVX2_BUCKET_LOAD8(5, lt, 12, pkt[12], bsize);
1004		NFT_PIPAPO_AVX2_AND(6, 2, 3);
1005		NFT_PIPAPO_AVX2_BUCKET_LOAD8(7, lt, 13, pkt[13], bsize);
1006		NFT_PIPAPO_AVX2_AND(0, 4, 5);
1007		NFT_PIPAPO_AVX2_BUCKET_LOAD8(1, lt, 14, pkt[14], bsize);
1008		NFT_PIPAPO_AVX2_AND(2, 6, 7);
1009		NFT_PIPAPO_AVX2_BUCKET_LOAD8(3, lt, 15, pkt[15], bsize);
1010		NFT_PIPAPO_AVX2_AND(4, 0, 1);
1011
1012		/* Stall */
1013		NFT_PIPAPO_AVX2_AND(5, 2, 3);
1014		NFT_PIPAPO_AVX2_AND(6, 4, 5);
1015
1016		NFT_PIPAPO_AVX2_NOMATCH_GOTO(6, nomatch);
1017		NFT_PIPAPO_AVX2_STORE(map[i_ul], 6);
1018
1019		b = nft_pipapo_avx2_refill(i_ul, &map[i_ul], fill, f->mt, last);
1020		if (last)
1021			return b;
1022
1023		if (unlikely(ret == -1))
1024			ret = b / XSAVE_YMM_SIZE;
1025
1026		continue;
1027
1028nomatch:
1029		NFT_PIPAPO_AVX2_STORE(map[i_ul], 15);
1030nothing:
1031		;
1032	}
1033
1034	return ret;
1035}
1036
1037/**
1038 * nft_pipapo_avx2_lookup_slow() - Fallback function for uncommon field sizes
1039 * @map:	Previous match result, used as initial bitmap
1040 * @fill:	Destination bitmap to be filled with current match result
1041 * @f:		Field, containing lookup and mapping tables
1042 * @offset:	Ignore buckets before the given index, no bits are filled there
1043 * @pkt:	Packet data, pointer to input nftables register
1044 * @first:	If this is the first field, don't source previous result
1045 * @last:	Last field: stop at the first match and return bit index
1046 *
1047 * This function should never be called, but is provided for the case the field
1048 * size doesn't match any of the known data types. Matching rate is
1049 * substantially lower than AVX2 routines.
1050 *
1051 * Return: -1 on no match, rule index of match if @last, otherwise first long
1052 * word index to be checked next (i.e. first filled word).
1053 */
1054static int nft_pipapo_avx2_lookup_slow(unsigned long *map, unsigned long *fill,
1055					const struct nft_pipapo_field *f,
1056					int offset, const u8 *pkt,
1057					bool first, bool last)
1058{
1059	unsigned long bsize = f->bsize;
1060	int i, ret = -1, b;
1061
1062	if (first)
1063		memset(map, 0xff, bsize * sizeof(*map));
1064
1065	for (i = offset; i < bsize; i++) {
1066		if (f->bb == 8)
1067			pipapo_and_field_buckets_8bit(f, map, pkt);
1068		else
1069			pipapo_and_field_buckets_4bit(f, map, pkt);
1070		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1071
1072		b = pipapo_refill(map, bsize, f->rules, fill, f->mt, last);
1073
1074		if (last)
1075			return b;
1076
1077		if (ret == -1)
1078			ret = b / XSAVE_YMM_SIZE;
1079	}
1080
1081	return ret;
1082}
1083
1084/**
1085 * nft_pipapo_avx2_estimate() - Set size, space and lookup complexity
1086 * @desc:	Set description, element count and field description used
1087 * @features:	Flags: NFT_SET_INTERVAL needs to be there
1088 * @est:	Storage for estimation data
1089 *
1090 * Return: true if set is compatible and AVX2 available, false otherwise.
1091 */
1092bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
1093			      struct nft_set_estimate *est)
1094{
1095	if (!(features & NFT_SET_INTERVAL) ||
1096	    desc->field_count < NFT_PIPAPO_MIN_FIELDS)
1097		return false;
1098
1099	if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
1100		return false;
1101
1102	est->size = pipapo_estimate_size(desc);
1103	if (!est->size)
1104		return false;
1105
1106	est->lookup = NFT_SET_CLASS_O_LOG_N;
1107
1108	est->space = NFT_SET_CLASS_O_N;
1109
1110	return true;
1111}
1112
1113/**
1114 * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
1115 * @net:	Network namespace
1116 * @set:	nftables API set representation
1117 * @key:	nftables API element representation containing key data
1118 * @ext:	nftables API extension pointer, filled with matching reference
1119 *
1120 * For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
1121 *
1122 * This implementation exploits the repetitive characteristic of the algorithm
1123 * to provide a fast, vectorised version using the AVX2 SIMD instruction set.
1124 *
1125 * Return: true on match, false otherwise.
1126 */
1127bool nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
1128			    const u32 *key, const struct nft_set_ext **ext)
1129{
1130	struct nft_pipapo *priv = nft_set_priv(set);
1131	struct nft_pipapo_scratch *scratch;
1132	u8 genmask = nft_genmask_cur(net);
1133	const struct nft_pipapo_match *m;
1134	const struct nft_pipapo_field *f;
1135	const u8 *rp = (const u8 *)key;
1136	unsigned long *res, *fill;
1137	bool map_index;
1138	int i, ret = 0;
1139
1140	if (unlikely(!irq_fpu_usable()))
1141		return nft_pipapo_lookup(net, set, key, ext);
1142
1143	m = rcu_dereference(priv->match);
1144
1145	/* This also protects access to all data related to scratch maps.
1146	 *
1147	 * Note that we don't need a valid MXCSR state for any of the
1148	 * operations we use here, so pass 0 as mask and spare a LDMXCSR
1149	 * instruction.
1150	 */
1151	kernel_fpu_begin_mask(0);
1152
1153	scratch = *raw_cpu_ptr(m->scratch);
1154	if (unlikely(!scratch)) {
1155		kernel_fpu_end();
1156		return false;
1157	}
1158
1159	map_index = scratch->map_index;
1160
1161	res  = scratch->map + (map_index ? m->bsize_max : 0);
1162	fill = scratch->map + (map_index ? 0 : m->bsize_max);
1163
1164	/* Starting map doesn't need to be set for this implementation */
1165
1166	nft_pipapo_avx2_prepare();
1167
1168next_match:
1169	nft_pipapo_for_each_field(f, i, m) {
1170		bool last = i == m->field_count - 1, first = !i;
1171
1172#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n)				\
1173		(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f,	\
1174							 ret, rp,	\
1175							 first, last))
1176
1177		if (likely(f->bb == 8)) {
1178			if (f->groups == 1) {
1179				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 1);
1180			} else if (f->groups == 2) {
1181				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 2);
1182			} else if (f->groups == 4) {
1183				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 4);
1184			} else if (f->groups == 6) {
1185				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 6);
1186			} else if (f->groups == 16) {
1187				NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
1188			} else {
1189				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1190								  ret, rp,
1191								  first, last);
1192			}
1193		} else {
1194			if (f->groups == 2) {
1195				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 2);
1196			} else if (f->groups == 4) {
1197				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 4);
1198			} else if (f->groups == 8) {
1199				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 8);
1200			} else if (f->groups == 12) {
1201				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 12);
1202			} else if (f->groups == 32) {
1203				NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
1204			} else {
1205				ret = nft_pipapo_avx2_lookup_slow(res, fill, f,
1206								  ret, rp,
1207								  first, last);
1208			}
1209		}
1210		NFT_PIPAPO_GROUP_BITS_ARE_8_OR_4;
1211
1212#undef NFT_SET_PIPAPO_AVX2_LOOKUP
1213
1214		if (ret < 0)
1215			goto out;
1216
1217		if (last) {
1218			*ext = &f->mt[ret].e->ext;
1219			if (unlikely(nft_set_elem_expired(*ext) ||
1220				     !nft_set_elem_active(*ext, genmask))) {
1221				ret = 0;
1222				goto next_match;
1223			}
1224
1225			goto out;
1226		}
1227
1228		swap(res, fill);
1229		rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
1230	}
1231
1232out:
1233	if (i % 2)
1234		scratch->map_index = !map_index;
1235	kernel_fpu_end();
1236
1237	return ret >= 0;
1238}
1239