1/* Optimized strlen implementation for PowerPC. 2 Copyright (C) 1997, 1999, 2000 Free Software Foundation, Inc. 3 This file is part of the GNU C Library. 4 5 The GNU C Library is free software; you can redistribute it and/or 6 modify it under the terms of the GNU Lesser General Public 7 License as published by the Free Software Foundation; either 8 version 2.1 of the License, or (at your option) any later version. 9 10 The GNU C Library is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 Lesser General Public License for more details. 14 15 You should have received a copy of the GNU Lesser General Public 16 License along with the GNU C Library; if not, write to the Free 17 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 18 02111-1307 USA. */ 19 20#include <sysdep.h> 21#include <bp-sym.h> 22#include <bp-asm.h> 23 24/* The algorithm here uses the following techniques: 25 26 1) Given a word 'x', we can test to see if it contains any 0 bytes 27 by subtracting 0x01010101, and seeing if any of the high bits of each 28 byte changed from 0 to 1. This works because the least significant 29 0 byte must have had no incoming carry (otherwise it's not the least 30 significant), so it is 0x00 - 0x01 == 0xff. For all other 31 byte values, either they have the high bit set initially, or when 32 1 is subtracted you get a value in the range 0x00-0x7f, none of which 33 have their high bit set. The expression here is 34 (x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when 35 there were no 0x00 bytes in the word. 36 37 2) Given a word 'x', we can test to see _which_ byte was zero by 38 calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f). 39 This produces 0x80 in each byte that was zero, and 0x00 in all 40 the other bytes. The '| 0x7f7f7f7f' clears the low 7 bits in each 41 byte, and the '| x' part ensures that bytes with the high bit set 42 produce 0x00. The addition will carry into the high bit of each byte 43 iff that byte had one of its low 7 bits set. We can then just see 44 which was the most significant bit set and divide by 8 to find how 45 many to add to the index. 46 This is from the book 'The PowerPC Compiler Writer's Guide', 47 by Steve Hoxey, Faraydon Karim, Bill Hay and Hank Warren. 48 49 We deal with strings not aligned to a word boundary by taking the 50 first word and ensuring that bytes not part of the string 51 are treated as nonzero. To allow for memory latency, we unroll the 52 loop a few times, being careful to ensure that we do not read ahead 53 across cache line boundaries. 54 55 Questions to answer: 56 1) How long are strings passed to strlen? If they're often really long, 57 we should probably use cache management instructions and/or unroll the 58 loop more. If they're often quite short, it might be better to use 59 fact (2) in the inner loop than have to recalculate it. 60 2) How popular are bytes with the high bit set? If they are very rare, 61 on some processors it might be useful to use the simpler expression 62 ~((x - 0x01010101) | 0x7f7f7f7f) (that is, on processors with only one 63 ALU), but this fails when any character has its high bit set. */ 64 65/* Some notes on register usage: Under the SVR4 ABI, we can use registers 66 0 and 3 through 12 (so long as we don't call any procedures) without 67 saving them. We can also use registers 14 through 31 if we save them. 68 We can't use r1 (it's the stack pointer), r2 nor r13 because the user 69 program may expect them to hold their usual value if we get sent 70 a signal. Integer parameters are passed in r3 through r10. 71 We can use condition registers cr0, cr1, cr5, cr6, and cr7 without saving 72 them, the others we must save. */ 73 74/* int [r3] strlen (char *s [r3]) */ 75 76ENTRY (BP_SYM (strlen)) 77 78#define rTMP1 r0 79#define rRTN r3 /* incoming STR arg, outgoing result */ 80#define rSTR r4 /* current string position */ 81#define rPADN r5 /* number of padding bits we prepend to the 82 string to make it start at a word boundary */ 83#define rFEFE r6 /* constant 0xfefefeff (-0x01010101) */ 84#define r7F7F r7 /* constant 0x7f7f7f7f */ 85#define rWORD1 r8 /* current string word */ 86#define rWORD2 r9 /* next string word */ 87#define rMASK r9 /* mask for first string word */ 88#define rTMP2 r10 89#define rTMP3 r11 90#define rTMP4 r12 91 92 CHECK_BOUNDS_LOW (rRTN, rTMP1, rTMP2) 93 94 clrrwi rSTR, rRTN, 2 95 lis r7F7F, 0x7f7f 96 rlwinm rPADN, rRTN, 3, 27, 28 97 lwz rWORD1, 0(rSTR) 98 li rMASK, -1 99 addi r7F7F, r7F7F, 0x7f7f 100/* That's the setup done, now do the first pair of words. 101 We make an exception and use method (2) on the first two words, to reduce 102 overhead. */ 103 srw rMASK, rMASK, rPADN 104 and rTMP1, r7F7F, rWORD1 105 or rTMP2, r7F7F, rWORD1 106 add rTMP1, rTMP1, r7F7F 107 nor rTMP1, rTMP2, rTMP1 108 and. rWORD1, rTMP1, rMASK 109 mtcrf 0x01, rRTN 110 bne L(done0) 111 lis rFEFE, -0x101 112 addi rFEFE, rFEFE, -0x101 113/* Are we now aligned to a doubleword boundary? */ 114 bt 29, L(loop) 115 116/* Handle second word of pair. */ 117 lwzu rWORD1, 4(rSTR) 118 and rTMP1, r7F7F, rWORD1 119 or rTMP2, r7F7F, rWORD1 120 add rTMP1, rTMP1, r7F7F 121 nor. rWORD1, rTMP2, rTMP1 122 bne L(done0) 123 124/* The loop. */ 125 126L(loop): 127 lwz rWORD1, 4(rSTR) 128 lwzu rWORD2, 8(rSTR) 129 add rTMP1, rFEFE, rWORD1 130 nor rTMP2, r7F7F, rWORD1 131 and. rTMP1, rTMP1, rTMP2 132 add rTMP3, rFEFE, rWORD2 133 nor rTMP4, r7F7F, rWORD2 134 bne L(done1) 135 and. rTMP1, rTMP3, rTMP4 136 beq L(loop) 137 138 and rTMP1, r7F7F, rWORD2 139 add rTMP1, rTMP1, r7F7F 140 andc rWORD1, rTMP4, rTMP1 141 b L(done0) 142 143L(done1): 144 and rTMP1, r7F7F, rWORD1 145 subi rSTR, rSTR, 4 146 add rTMP1, rTMP1, r7F7F 147 andc rWORD1, rTMP2, rTMP1 148 149/* When we get to here, rSTR points to the first word in the string that 150 contains a zero byte, and the most significant set bit in rWORD1 is in that 151 byte. */ 152L(done0): 153 cntlzw rTMP3, rWORD1 154 subf rTMP1, rRTN, rSTR 155 srwi rTMP3, rTMP3, 3 156 add rRTN, rTMP1, rTMP3 157 /* GKM FIXME: check high bound. */ 158 blr 159END (BP_SYM (strlen)) 160