/*
 * arch/alpha/lib/ev6-stxcpy.S
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Copy a null-terminated string from SRC to DST.
 *
 * This is an internal routine used by strcpy, stpcpy, and strcat.
 * As such, it uses special linkage conventions to make implementation
 * of these public functions more efficient.
 *
 * On input:
 *	t9 = return address
 *	a0 = DST
 *	a1 = SRC
 *
 * On output:
 *	t12 = bitmask (with one bit set) indicating the last byte written
 *	a0  = unaligned address of the last *word* written
 *
 * Furthermore, v0, a3-a5, t11, and t12 are untouched.
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

#include <asm/regdef.h>

	.set noat
	.set noreorder

	.text

/* Fast path: source and destination share the same alignment within
   a quadword, so each aligned source word maps onto exactly one
   aligned destination word and no byte shifting is needed.  */

	.ent stxcpy_aligned
	.align 4
stxcpy_aligned:
	.frame sp, 0, t9
	.prologue 0

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == the first source word.  */

	/* Create the 1st output word and detect 0's in the 1st input word.
	   Bytes of t1 below the dst/src offset are forced to non-zero so a
	   stale null there cannot trigger a false end-of-string.  */
	lda	t2, -1		# E : build a mask against false zero
	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
	mskqh	t1, a1, t3	# U :
	ornot	t1, t2, t2	# E : (stall)

	mskql	t0, a1, t0	# U : assemble the first output word
	cmpbge	zero, t2, t8	# E : bits set iff null found
	or	t0, t3, t1	# E : (stall)
	bne	t8, $a_eos	# U : (stall)

	/* On entry to this basic block:
	   t0 == the first destination word for masking back in
	   t1 == a source word not containing a null.  */
	/* Nops here to separate store quads from load quads */

$a_loop:
	stq_u	t1, 0(a0)	# L :
	addq	a0, 8, a0	# E :
	nop
	nop

	ldq_u	t1, 0(a1)	# L : Latency=3
	addq	a1, 8, a1	# E :
	cmpbge	zero, t1, t8	# E : (3 cycle stall)
	beq	t8, $a_loop	# U : (stall for t8)

	/* Take care of the final (partial) word store.
	   On entry to this basic block we have:
	   t1 == the source word containing the null
	   t8 == the cmpbge mask that found it.  */
$a_eos:
	negq	t8, t6		# E : find low bit set
	and	t8, t6, t12	# E : t12 = isolated low bit = last byte written (stall)
	/* For the sake of the cache, don't read a destination word
	   if we're not going to need it.  */
	and	t12, 0x80, t6	# E : null in the top byte => full word store (stall)
	bne	t6, 1f		# U : (stall)

	/* We're doing a partial word store and so need to combine
	   our source and original destination words.  */
	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E : t6 = mask of bytes below the null
	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
	or	t12, t6, t8	# E : t8 = mask of bytes up to and incl. the null (stall)

	zap	t0, t8, t0	# E : clear dst bytes <= null
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	.end stxcpy_aligned

	.align 4
	.ent __stxcpy
	.globl __stxcpy
__stxcpy:
	.frame sp, 0, t9
	.prologue 0

	/* Are source and destination co-aligned?  */
	xor	a0, a1, t0	# E :
	unop			# E :
	and	t0, 7, t0	# E : (stall)
	bne	t0, $unaligned	# U : (stall)

	/* We are co-aligned; take care of a partial first word.  */
	ldq_u	t1, 0(a1)	# L : load first src word
	and	a0, 7, t0	# E : take care not to load a word ...
	addq	a1, 8, a1	# E :
	beq	t0, stxcpy_aligned	# U : ... if we won't need it (stall)

	ldq_u	t0, 0(a0)	# L :
	br	stxcpy_aligned	# L0 : Latency=3
	nop
	nop


/* The source and destination are not co-aligned.  Align the destination
   and cope.  We have to be very careful about not reading too much and
   causing a SEGV.  */

	.align 4
$u_head:
	/* We know just enough now to be able to assemble the first
	   full source word.  We can still find a zero at the end of it
	   that prevents us from outputting the whole thing.

	   On entry to this basic block:
	   t0 == the first dest word, for masking back in, if needed else 0
	   t1 == the low bits of the first source word
	   t6 == bytemask that is -1 in dest word bytes */

	ldq_u	t2, 8(a1)	# L :
	addq	a1, 8, a1	# E :
	extql	t1, a1, t1	# U : (stall on a1)
	extqh	t2, a1, t4	# U : (stall on a1)

	mskql	t0, a0, t0	# U :
	or	t1, t4, t1	# E :
	mskqh	t1, a0, t1	# U : (stall on t1)
	or	t0, t1, t1	# E : (stall on t1)

	/* The dest bytes protected by t6 are forced non-zero below so that
	   original destination data cannot look like a source null.  */
	or	t1, t6, t6	# E :
	cmpbge	zero, t6, t8	# E : (stall)
	lda	t6, -1		# E : for masking just below
	bne	t8, $u_final	# U : (stall)

	mskql	t6, a1, t6	# U : mask out the bits we have
	or	t6, t2, t2	# E :   already extracted before (stall)
	cmpbge	zero, t2, t8	# E :   testing eos (stall)
	bne	t8, $u_late_head_exit	# U : (stall)

	/* Finally, we've got all the stupid leading edge cases taken care
	   of and we can set up to enter the main loop.  */

	stq_u	t1, 0(a0)	# L : store first output word
	addq	a0, 8, a0	# E :
	extql	t2, a1, t0	# U : position ho-bits of lo word
	ldq_u	t2, 8(a1)	# L : read next high-order source word

	addq	a1, 8, a1	# E :
	cmpbge	zero, t2, t8	# E : (stall for t2)
	nop			# E :
	bne	t8, $u_eos	# U : (stall)

	/* Unaligned copy main loop.  In order to avoid reading too much,
	   the loop is structured to detect zeros in aligned source words.
	   This has, unfortunately, effectively pulled half of a loop
	   iteration out into the head and half into the tail, but it does
	   prevent nastiness from accumulating in the very thing we want
	   to run as fast as possible.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word

	   We further know that t2 does not contain a null terminator.  */

	.align 3
$u_loop:
	extqh	t2, a1, t1	# U : extract high bits for current word
	addq	a1, 8, a1	# E : (stall)
	extql	t2, a1, t3	# U : extract low bits for next time (stall)
	addq	a0, 8, a0	# E :

	or	t0, t1, t1	# E : current dst word now complete
	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
	stq_u	t1, -8(a0)	# L : save the current word (stall)
	mov	t3, t0		# E :

	cmpbge	zero, t2, t8	# E : test new word for eos
	beq	t8, $u_loop	# U : (stall)
	nop
	nop

	/* We've found a zero somewhere in the source word we just read.
	   If it resides in the lower half, we have one (probably partial)
	   word to write out, and if it resides in the upper half, we
	   have one full and one partial word left to write out.

	   On entry to this basic block:
	   t0 == the shifted high-order bits from the previous source word
	   t2 == the unshifted current source word.  */
$u_eos:
	extqh	t2, a1, t1	# U :
	or	t0, t1, t1	# E : first (partial) source word complete (stall)
	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
	bne	t8, $u_final	# U : (stall)

$u_late_head_exit:
	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
	addq	a0, 8, a0	# E :
	extql	t2, a1, t1	# U :
	cmpbge	zero, t1, t8	# E : (stall)

	/* Take care of a final (probably partial) result word.
	   On entry to this basic block:
	   t1 == assembled source word
	   t8 == cmpbge mask that found the null.  */
$u_final:
	negq	t8, t6		# E : isolate low bit set
	and	t6, t8, t12	# E : (stall)
	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
	bne	t6, 1f		# U : (stall)

	ldq_u	t0, 0(a0)	# L : Latency=3
	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)

	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
	or	t0, t1, t1	# E : (stall)
	nop
	nop

1:	stq_u	t1, 0(a0)	# L :
	ret	(t9)		# L0 : Latency=3
	nop
	nop

	/* Unaligned copy entry point.  */
	.align 4
$unaligned:

	ldq_u	t1, 0(a1)	# L : load first source word
	and	a0, 7, t4	# E : find dest misalignment
	and	a1, 7, t5	# E : find src misalignment
	/* Conditionally load the first destination word and a bytemask
	   with 0xff indicating that the destination byte is sacrosanct.  */
	mov	zero, t0	# E :

	mov	zero, t6	# E :
	beq	t4, 1f		# U :
	ldq_u	t0, 0(a0)	# L :
	lda	t6, -1		# E :

	mskql	t6, a0, t6	# U :
	nop
	nop
	nop
1:
	subq	a1, t4, a1	# E : sub dest misalignment from src addr
	/* If source misalignment is larger than dest misalignment, we need
	   extra startup checks to avoid SEGV.  */
	cmplt	t4, t5, t12	# E :
	beq	t12, $u_head	# U :
	lda	t2, -1		# E : mask out leading garbage in source

	mskqh	t2, t5, t2	# U :
	ornot	t1, t2, t3	# E : (stall)
	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
	beq	t8, $u_head	# U : (stall)

	/* At this point we've found a zero in the first partial word of
	   the source.  We need to isolate the valid source data and mask
	   it into the original destination data.  (Incidentally, we know
	   that we'll need at least one byte of that original dest word.) */

	ldq_u	t0, 0(a0)	# L :
	negq	t8, t6		# E : build bitmask of bytes <= zero
	and	t6, t8, t12	# E : (stall)
	and	a1, 7, t5	# E :

	subq	t12, 1, t6	# E :
	or	t6, t12, t8	# E : (stall)
	srl	t12, t5, t12	# U : adjust final null return value
	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)

	and	t1, t2, t1	# E : to source validity mask
	extql	t2, a1, t2	# U :
	extql	t1, a1, t1	# U : (stall)
	andnot	t0, t2, t0	# E : zero place for source to reside (stall)

	or	t0, t1, t1	# E : and put it there
	stq_u	t1, 0(a0)	# L : (stall)
	ret	(t9)		# L0 : Latency=3
	nop

	.end __stxcpy