dev/ppc/xsumas.s

6  * This file contains Original Code and/or Modifications of Original Code
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
15  * Please obtain a copy of the License at
30 #define cr1_gt  5       // bit 1 of cr1
36  *  r4 - Length of data
38  *  r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
44  * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
52  * is set on the low 32-bits of the sum.)
54  * Using Altivec is tempting, but the performance impact of the greatly increased
55  * number of exceptions and register save/restore traffic probably makes it impractical
79         li      r7,-1           ; start of mask for partial fill
84         addi    r3,r3,4         ; point to next word of input
96         srwi.   r0,r4,5         ; get count of 32-byte chunks
142         bf      27,Lleftover8   ; test 0x10 bit of residual length
185         srwi    r6,r2,16        ; top half of 32-bit checksum
203         srwi    r6,r2,16        ; top half of 32-bit checksum
244 ;          ctr = number of 32-byte chunks of input
260         mr      r14,r2          ; just copy incoming partial word into one of the accumulators
268         lwz     r12,28(r3)      ; load last word of previous chunk
275 ;       It is pipelined (loads are one iteration ahead of adds), and unrolled.
276 ;       It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
329 ;           r3 = word aligned address of next byte of data
335 LEarlyExit:                     ; here from middle of inner loop
336         lwz     r12,28(r3)      ; load last word of last chunk
338 LAddLastChunk:                  ; last 32-byte chunk of input is in r4,r6-r12
350         bf      27,Lleft1       ; test 0x10 bit of residual length
385 ;           r3 = word aligned address of next byte of data
396         add     r8,r8,r9        ; now r8 is 64-bit sum of the four accumulators
399         srdi    r7,r8,32        ; get upper half of 64-bit sum
400         addc    r2,r7,r8        ; finally, do a 32-bit add of the two halves of r8 (setting carry)