/*
 * Compute 16-bit sum in ones' complement arithmetic (with end-around carry).
 * This sum is often used as a simple checksum in networking.
 *
 * Copyright (c) 2020, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
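/*
 * For illustration: ones' complement 16-bit addition wraps any carry out of
 * bit 15 back into bit 0 (the "end-around carry"), e.g.
 *
 *   0xffff + 0x0003 = 0x10002  ->  0x0002 + 1 = 0x0003
 *
 * The code below defers that folding: 32-bit loads are accumulated into a
 * 64-bit sum and the carries are only folded back in at the very end.
 */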

#include "networking.h"
#include "chksum_common.h"

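/*
 * Consume the misaligned head of the buffer: load the aligned 32-bit word
 * containing the first byte, mask off the bytes that precede *pptr (on a
 * little-endian target these occupy the low-order bits of the load), then
 * advance *pptr past that word and reduce *nbytes by the 4 - off bytes
 * actually consumed.  load32(), align_ptr(), Assert() and likely() come from
 * the included helper headers.
 */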
always_inline
static inline uint32_t
slurp_head32(const void **pptr, uint32_t *nbytes)
{
    uint32_t sum = 0;
    Assert(*nbytes >= 4);
    uint32_t off = (uintptr_t) *pptr % 4;
    if (likely(off != 0))
    {
        /* Get rid of bytes 0..off-1 */
        const unsigned char *ptr32 = align_ptr(*pptr, 4);
        uint32_t mask = ~0U << (CHAR_BIT * off);
        sum = load32(ptr32) & mask;
        *pptr = ptr32 + 4;
        *nbytes -= 4 - off;
    }
    return sum;
}

/* Additional loop unrolling would help when not auto-vectorizing */
unsigned short
__chksum(const void *ptr, unsigned int nbytes)
{
    bool swap = false;
    uint64_t sum = 0;

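    /*
     * For larger buffers it pays to 4-byte align the pointer first.  If the
     * data starts at an odd address, the 16-bit words are effectively summed
     * shifted by one byte; `swap` records this so the folded result can be
     * byte-swapped at the end to compensate (a standard ones' complement
     * checksum property, see RFC 1071).
     */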
    if (nbytes > 300)
    {
        /* 4-byte align pointer */
        swap = (uintptr_t) ptr & 1;
        sum = slurp_head32(&ptr, &nbytes);
    }
    /* Else benefit of aligning not worth the overhead */

    /* Sum all 16-byte chunks */
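    /*
     * Each iteration adds four 32-bit values to the 64-bit accumulator, so
     * the carries pile up in its upper half.  With nbytes limited to a 32-bit
     * unsigned int the accumulator cannot overflow, which is what allows the
     * end-around folding to be postponed until the end.
     */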
    const char *cptr = ptr;
    for (uint32_t nquads = nbytes / 16; nquads != 0; nquads--)
    {
        uint64_t h0 = load32(cptr + 0);
        uint64_t h1 = load32(cptr + 4);
        uint64_t h2 = load32(cptr + 8);
        uint64_t h3 = load32(cptr + 12);
        sum += h0 + h1 + h2 + h3;
        cptr += 16;
    }
    nbytes %= 16;
    Assert(nbytes < 16);

    /* Handle any trailing 4-byte chunks */
    while (nbytes >= 4)
    {
        sum += load32(cptr);
        cptr += 4;
        nbytes -= 4;
    }
    Assert(nbytes < 4);

    if (nbytes & 2)
    {
        sum += load16(cptr);
        cptr += 2;
    }

    if (nbytes & 1)
    {
        sum += *(uint8_t *)cptr;
    }

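    /*
     * fold_and_swap() (from chksum_common.h) is expected to fold the 64-bit
     * accumulator into a 16-bit ones' complement sum with end-around carry,
     * byte-swapping the result when the data started at an odd address.
     */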
    return fold_and_swap(sum, swap);
}