/*	$NetBSD: in_cksum_arm.S,v 1.2 2003/09/23 10:01:36 scw Exp $	*/

/*-
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Hand-optimised Internet checksum routines for ARM/armv5e:
 *
 *	in_cksum()	checksum over an mbuf chain
 *	do_cksum()	raw 32-bit partial sum over a flat buffer
 *	L_cksumdata	the shared summing loop used by both
 *
 * Syntax: GNU as, ARM unified syntax, run through the C preprocessor.
 */

#include "opt_inet.h"

#include <machine/asm.h>
#include "assym.s"
__FBSDID("$FreeBSD$");

	.syntax	unified
/*
 * int in_cksum(struct mbuf *m, int len)
 *
 * Entry:
 *	r0	m
 *	r1	len
 *
 * Returns:
 *	r0	16-bit one's-complement of the folded sum
 *
 * Register usage inside the loop:
 *	r8	running 32-bit sum over all mbufs visited so far
 *	r9	bytes still to be summed (starts at 'len')
 *	r10	number of bytes already consumed (offset into the data stream)
 *	r11	(r10 ^ data pointer); bit 0 says whether this mbuf's data
 *		starts on the opposite byte lane from its stream offset
 *	ip	next mbuf in the chain
 *
 * r4-r11 and lr are saved on entry and restored on exit.
 * M_LEN, M_DATA and M_NEXT are not defined in this file; presumably they
 * are the struct mbuf field offsets generated into assym.s.
 *
 * NOTE: Assumes 'm' is *never* NULL.
 */
/* LINTSTUB: Func: int in_cksum(struct mbuf *, int) */
ENTRY(in_cksum)
	stmfd	sp!, {r4-r11,lr}
	mov	r8, #0x00
	mov	r9, r1
	mov	r10, #0x00
	mov	ip, r0

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry4:			/* No branch in this file targets this label */
	cmp	r9, r1
	movlt	r1, r9			/* r1 = min(bytes in mbuf, bytes wanted) */
	sub	r9, r9, r1
	eor	r11, r10, r0
	add	r10, r10, r1
	adds	r2, r1, #0x00		/* r2 = 0 and Z set if nothing to sum */
	blne	_ASM_LABEL(L_cksumdata)	/* Only called with a non-zero length */
	tst	r11, #0x01
	movne	r2, r2, ror #8		/* Swap byte lanes of this partial sum */
	adds	r8, r8, r2
	adc	r8, r8, #0x00		/* End-around carry */
	cmp	ip, #0x00
	bne	.Lin_cksum_loop

	/* Fold the 32-bit sum to 16 bits and complement it */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
	eor	r0, r0, r1
	ldmfd	sp!, {r4-r11,pc}
END(in_cksum)

/*
 * do_cksum: return the raw 32-bit partial sum of a flat buffer.
 *
 * Entry:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *
 * Returns:
 *	r0	32-bit sum as produced by L_cksumdata (not folded, not
 *		complemented); 0 when the length is zero.
 *
 * A zero length is handled here, using the same adds/blne idiom as
 * in_cksum(), because L_cksumdata's short-buffer path always fetches at
 * least one byte and would otherwise read past the end of an empty buffer.
 *
 * r4-r7 are saved because L_cksumdata clobbers them.
 */
ENTRY(do_cksum)
	stmfd	sp!, {r4-r7, lr}
	adds	r2, r1, #0x00		/* r2 = 0 and Z set if nothing to sum */
	blne	L_cksumdata
	mov	r0, r2
	ldmfd	sp!, {r4-r7, pc}
END(do_cksum)

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length.  Must be non-zero: the 1-3 byte "endgame"
 *		path fetches its first byte unconditionally.
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 *
 * The long adcs chains rely on the carry flag being passed from one
 * instruction to the next; only flag-preserving instructions (loads, pld)
 * are interleaved with them.  Do not reorder.
 */
/* LINTSTUB: Ignore */
ASENTRY_NP(L_cksumdata)
#ifdef _ARM_ARCH_5E
	pld	[r0]			/* Pre-fetch the start of the buffer */
#endif
	mov	r2, #0

	/* We first have to word-align the buffer. */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes needed to reach alignment */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02		/* Flags select how many bytes to fetch */
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/*
	 * Combine the three bytes depending on endianness and alignment.
	 * EQ (r7 == 2) means the data started on an even address,
	 * NE (r7 == 1 or 3) means it started on an odd address.
	 */
#ifdef __ARMEB__
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	RETeq				/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:
#ifdef _ARM_ARCH_5E
	cmp	r1, #0x04		/* Less than 4 bytes left? */
	blt	.Lcksumdata_endgame	/* Yup */

	/*
	 * Now quad-align, if necessary.  r7 ends up holding either the
	 * word consumed to reach alignment or zero; it is summed later.
	 */
	ands	r7, r0, #0x04
	ldrne	r7, [r0], #0x04
	subne	r1, r1, #0x04
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end	/* Note: C flag clear if branch taken */

	/*
	 * Buffer is now quad aligned. Sum 64 bytes at a time.
	 * Note: First ldrd is hoisted above the loop, together with
	 * setting r6 to zero to avoid stalling for results in the
	 * loop. (r7 is live, from above).
	 */
	ldrd	r4, [r0], #0x08
	mov	r6, #0x00
.Lcksumdata_bigloop:
	pld	[r0, #0x18]
	adds	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	ldrdge	r4, [r0], #0x08
	bge	.Lcksumdata_bigloop

	adds	r2, r2, r6		/* r6/r7 still need summing */
.Lcksumdata_bigloop_end:
	adcs	r2, r2, r7
	adc	r2, r2, #0x00

#else	/* !_ARM_ARCH_5E */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Sum 64 bytes per iteration, four words per ldmia */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:
#endif

	adds	r1, r1, #0x40		/* Undo the bias; r1 = bytes left (< 64) */
	RETeq
	cmp	r1, #0x20

#ifdef _ARM_ARCH_5E
	ldrdge	r4, [r0], #0x08		/* Avoid stalling pld and result */
	blt	.Lcksumdata_less_than_32
	pld	[r0, #0x18]
	ldrd	r6, [r0], #0x08
	adds	r2, r2, r4
	adcs	r2, r2, r5
	ldrd	r4, [r0], #0x08
	adcs	r2, r2, r6
	adcs	r2, r2, r7
	ldrd	r6, [r0], #0x08
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r6		/* XXX: Unavoidable result stall */
	adcs	r2, r2, r7
#else
	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
#endif
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	RETeq

.Lcksumdata_less_than_32:
	/*
	 * There are fewer than 32 bytes left.
	 *
	 * r3 = number of bytes covered by whole 8-byte chunks (0, 8, 16, 24).
	 * Each 8-byte step below is three instructions (12 bytes of code),
	 * so the number of code bytes to skip is (0x18 - r3) * 1.5.  Reading
	 * pc yields the address of the addne plus 8, i.e. the instruction
	 * after the nop, which is why the nop is needed as padding.  This
	 * assumes fixed 4-byte ARM-state instructions.
	 */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4
	nop

/*
 * Note: We use ldm here, even on armv5e, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	RETeq

	/*
	 * Deal with 1 to 3 remaining bytes, possibly misaligned.
	 * The first byte is fetched unconditionally, so r1 must be >= 1 here.
	 */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#ifdef __ARMEB__
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00
	RET
END(L_cksumdata)
