• Home
  • History
  • Annotate
  • Line#
  • Navigate
  • Raw
  • Download
  • only in /asuswrt-rt-n18u-9.0.0.4.380.2695/release/src-rt-6.x.4708/linux/linux-2.6/arch/alpha/lib/
1/*
2 * arch/alpha/lib/ev6-stxcpy.S
3 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
4 *
5 * Copy a null-terminated string from SRC to DST.
6 *
7 * This is an internal routine used by strcpy, stpcpy, and strcat.
8 * As such, it uses special linkage conventions to make implementation
9 * of these public functions more efficient.
10 *
11 * On input:
12 *	t9 = return address
13 *	a0 = DST
14 *	a1 = SRC
15 *
16 * On output:
17 *	t12 = bitmask (with one bit set) indicating the last byte written
18 *	a0  = unaligned address of the last *word* written
19 *
20 * Furthermore, v0, a3-a5, and t11 are untouched (t12 is written: it carries the output bitmask).
21 *
22 * Much of the information about 21264 scheduling/coding comes from:
23 *	Compiler Writer's Guide for the Alpha 21264
24 *	abbreviated as 'CWG' in other comments here
25 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
26 * Scheduling notation:
27 *	E	- either cluster
28 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
29 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
30 * Try not to change the actual algorithm if possible for consistency.
31 */
32
33#include <asm/regdef.h>
34
35	.set noat
36	.set noreorder
37
38	.text
39
40
41
/* stxcpy_aligned -- copy loop used when SRC and DST have the same offset
   within a quadword.  It has no .globl directive in this file and is
   reached only by branch from __stxcpy, which has already loaded the
   first source word into t1, advanced a1 by 8, and loaded the first
   destination word into t0 when DST is not 8-byte aligned.

   Returns through t9.  On return t12 holds the single-bit cmpbge mask
   selecting the null byte, and a0 still addresses the last word stored
   (a0 is not advanced after the final store).  */
42	.ent stxcpy_aligned
43	.align 4
44stxcpy_aligned:
45	.frame sp, 0, t9
46	.prologue 0
47
48	/* On entry to this basic block:
49	   t0 == the first destination word for masking back in
50	   t1 == the first source word.  */
51
52	/* Create the 1st output word and detect 0's in the 1st input word.  */
	/* t2 becomes the source word with the bytes that precede the string
	   start forced non-zero, so only real string bytes can trip the
	   null test; t3 is the source word with those leading bytes
	   cleared, ready to be merged with the preserved dest bytes in t0.  */
53	lda	t2, -1		# E : build a mask against false zero
54	mskqh	t2, a1, t2	# U :   detection in the src word (stall)
55	mskqh	t1, a1, t3	# U :
56	ornot	t1, t2, t2	# E : (stall)
57
58	mskql	t0, a1, t0	# U : assemble the first output word
59	cmpbge	zero, t2, t8	# E : bits set iff null found
60	or	t0, t3, t1	# E : (stall)
61	bne	t8, $a_eos	# U : (stall)
62
63	/* On entry to this basic block:
64	   t0 == the first destination word for masking back in
65	   t1 == a source word not containing a null.  */
66	/* Nops here to separate store quads from load quads */
67
	/* Each iteration stores the null-free word in t1, then fetches and
	   tests the next source word; a0 and a1 both advance by 8.  */
68$a_loop:
69	stq_u	t1, 0(a0)	# L :
70	addq	a0, 8, a0	# E :
71	nop
72	nop
73
74	ldq_u	t1, 0(a1)	# L : Latency=3
75	addq	a1, 8, a1	# E :
76	cmpbge	zero, t1, t8	# E : (3 cycle stall)
77	beq	t8, $a_loop	# U : (stall for t8)
78
79	/* Take care of the final (partial) word store.
80	   On entry to this basic block we have:
81	   t1 == the source word containing the null
82	   t8 == the cmpbge mask that found it.  */
83$a_eos:
	/* t8 & -t8 leaves only the lowest set bit, i.e. the first null.  */
84	negq	t8, t6		# E : find low bit set
85	and	t8, t6, t12	# E : (stall)
86	/* For the sake of the cache, don't read a destination word
87	   if we're not going to need it.  */
	/* Bit 0x80 set means the null is the last byte of the word, so the
	   whole word comes from the source and no merge is required.  */
88	and	t12, 0x80, t6	# E : (stall)
89	bne	t6, 1f		# U : (stall)
90
91	/* We're doing a partial word store and so need to combine
92	   our source and original destination words.  */
	/* t6 = t12 - 1 selects the bytes below the null; t8 = t6 | t12
	   selects the bytes up to and including it.  */
93	ldq_u	t0, 0(a0)	# L : Latency=3
94	subq	t12, 1, t6	# E :
95	zapnot	t1, t6, t1	# U : clear src bytes >= null (stall)
96	or	t12, t6, t8	# E : (stall)
97
98	zap	t0, t8, t0	# E : clear dst bytes <= null
99	or	t0, t1, t1	# E : (stall)
100	nop
101	nop
102
1031:	stq_u	t1, 0(a0)	# L :
104	ret	(t9)		# L0 : Latency=3
105	nop
106	nop
107
108	.end stxcpy_aligned
109
/* __stxcpy -- global entry point.  Per the file header: t9 = return
   address, a0 = DST, a1 = SRC; on return t12 is the one-bit mask of the
   last byte written and a0 addresses the last word written.

   The entry code only dispatches.  If the low three bits of a0 and a1
   agree, it loads the first source word (and the first destination word
   when a0 is not 8-byte aligned), advances a1 by 8 and branches to
   stxcpy_aligned.  Otherwise it branches to $unaligned, which is placed
   after the unaligned loop and tail code below.  */
110	.align 4
111	.ent __stxcpy
112	.globl __stxcpy
113__stxcpy:
114	.frame sp, 0, t9
115	.prologue 0
116
117	/* Are source and destination co-aligned?  */
118	xor	a0, a1, t0	# E :
119	unop			# E :
120	and	t0, 7, t0	# E : (stall)
121	bne	t0, $unaligned	# U : (stall)
122
123	/* We are co-aligned; take care of a partial first word.  */
124	ldq_u	t1, 0(a1)		# L : load first src word
125	and	a0, 7, t0		# E : take care not to load a word ...
126	addq	a1, 8, a1		# E :
127	beq	t0, stxcpy_aligned	# U : ... if we wont need it (stall)
128
129	ldq_u	t0, 0(a0)	# L :
130	br	stxcpy_aligned	# L0 : Latency=3
131	nop
132	nop
133
134
135/* The source and destination are not co-aligned.  Align the destination
136   and cope.  We have to be very careful about not reading too much and
137   causing a SEGV.  */
138
139	.align 4
140$u_head:
141	/* We know just enough now to be able to assemble the first
142	   full source word.  We can still find a zero at the end of it
143	   that prevents us from outputting the whole thing.
144
145	   On entry to this basic block:
146	   t0 == the first dest word, for masking back in, if needed else 0
147	   t1 == the low bits of the first source word
148	   t6 == bytemask that is -1 in dest word bytes */
149
	/* Reached only from $unaligned, which has already subtracted the
	   destination misalignment from a1.  */
150	ldq_u	t2, 8(a1)	# L :
151	addq	a1, 8, a1	# E :
152	extql	t1, a1, t1	# U : (stall on a1)
153	extqh	t2, a1, t4	# U : (stall on a1)
154
155	mskql	t0, a0, t0	# U :
156	or	t1, t4, t1	# E :
157	mskqh	t1, a0, t1	# U : (stall on t1)
158	or	t0, t1, t1	# E : (stall on t1)
159
	/* OR-ing in t6 makes the preserved destination bytes non-zero so
	   they cannot be mistaken for the terminator.  */
160	or	t1, t6, t6	# E :
161	cmpbge	zero, t6, t8	# E : (stall)
162	lda	t6, -1		# E : for masking just below
163	bne	t8, $u_final	# U : (stall)
164
165	mskql	t6, a1, t6		# U : mask out the bits we have
166	or	t6, t2, t2		# E :   already extracted before (stall)
167	cmpbge	zero, t2, t8		# E :   testing eos (stall)
168	bne	t8, $u_late_head_exit	# U : (stall)
169
170	/* Finally, we've got all the stupid leading edge cases taken care
171	   of and we can set up to enter the main loop.  */
172
173	stq_u	t1, 0(a0)	# L : store first output word
174	addq	a0, 8, a0	# E :
175	extql	t2, a1, t0	# U : position ho-bits of lo word
176	ldq_u	t2, 8(a1)	# U : read next high-order source word
177
178	addq	a1, 8, a1	# E :
179	cmpbge	zero, t2, t8	# E : (stall for t2)
180	nop			# E :
181	bne	t8, $u_eos	# U : (stall)
182
183	/* Unaligned copy main loop.  In order to avoid reading too much,
184	   the loop is structured to detect zeros in aligned source words.
185	   This has, unfortunately, effectively pulled half of a loop
186	   iteration out into the head and half into the tail, but it does
187	   prevent nastiness from accumulating in the very thing we want
188	   to run as fast as possible.
189
190	   On entry to this basic block:
191	   t0 == the shifted high-order bits from the previous source word
192	   t2 == the unshifted current source word
193
194	   We further know that t2 does not contain a null terminator.  */
195
196	.align 3
197$u_loop:
198	extqh	t2, a1, t1	# U : extract high bits for current word
199	addq	a1, 8, a1	# E : (stall)
200	extql	t2, a1, t3	# U : extract low bits for next time (stall)
201	addq	a0, 8, a0	# E :
202
203	or	t0, t1, t1	# E : current dst word now complete
204	ldq_u	t2, 0(a1)	# L : Latency=3 load high word for next time
205	stq_u	t1, -8(a0)	# L : save the current word (stall)
206	mov	t3, t0		# E :
207
208	cmpbge	zero, t2, t8	# E : test new word for eos
209	beq	t8, $u_loop	# U : (stall)
210	nop
211	nop
212
213	/* We've found a zero somewhere in the source word we just read.
214	   If it resides in the lower half, we have one (probably partial)
215	   word to write out, and if it resides in the upper half, we
216	   have one full and one partial word left to write out.
217
218	   On entry to this basic block:
219	   t0 == the shifted high-order bits from the previous source word
220	   t2 == the unshifted current source word.  */
221$u_eos:
222	extqh	t2, a1, t1	# U :
223	or	t0, t1, t1	# E : first (partial) source word complete (stall)
224	cmpbge	zero, t1, t8	# E : is the null in this first bit? (stall)
225	bne	t8, $u_final	# U : (stall)
226
	/* Also entered from $u_head when the first assembled word is
	   null-free but the second source word contains the terminator.
	   Falls through into $u_final with the last word in t1.  */
227$u_late_head_exit:
228	stq_u	t1, 0(a0)	# L : the null was in the high-order bits
229	addq	a0, 8, a0	# E :
230	extql	t2, a1, t1	# U :
231	cmpbge	zero, t1, t8	# E : (stall)
232
233	/* Take care of a final (probably partial) result word.
234	   On entry to this basic block:
235	   t1 == assembled source word
236	   t8 == cmpbge mask that found the null.  */
	/* Same tail sequence as $a_eos in stxcpy_aligned: t12 = lowest set
	   bit of t8; if that is bit 0x80 the word is stored whole,
	   otherwise it is merged with the existing destination word.  */
237$u_final:
238	negq	t8, t6		# E : isolate low bit set
239	and	t6, t8, t12	# E : (stall)
240	and	t12, 0x80, t6	# E : avoid dest word load if we can (stall)
241	bne	t6, 1f		# U : (stall)
242
243	ldq_u	t0, 0(a0)	# E :
244	subq	t12, 1, t6	# E :
245	or	t6, t12, t8	# E : (stall)
246	zapnot	t1, t6, t1	# U : kill source bytes >= null (stall)
247
248	zap	t0, t8, t0	# U : kill dest bytes <= null (2 cycle data stall)
249	or	t0, t1, t1	# E : (stall)
250	nop
251	nop
252
2531:	stq_u	t1, 0(a0)	# L :
254	ret	(t9)		# L0 : Latency=3
255	nop
256	nop
257
258	/* Unaligned copy entry point.  */
	/* Register roles set up here:
	   t1 == first source word
	   t4 == a0 & 7 (destination misalignment)
	   t5 == a1 & 7 (source misalignment)
	   t0 == first destination word, or 0 when t4 is 0
	   t6 == 0xff in each destination byte to preserve, or 0 when t4 is 0  */
259	.align 4
260$unaligned:
261
262	ldq_u	t1, 0(a1)	# L : load first source word
263	and	a0, 7, t4	# E : find dest misalignment
264	and	a1, 7, t5	# E : find src misalignment
265	/* Conditionally load the first destination word and a bytemask
266	   with 0xff indicating that the destination byte is sacrosanct.  */
267	mov	zero, t0	# E :
268
269	mov	zero, t6	# E :
270	beq	t4, 1f		# U :
271	ldq_u	t0, 0(a0)	# L :
272	lda	t6, -1		# E :
273
274	mskql	t6, a0, t6	# U :
275	nop
276	nop
277	nop
2781:
279	subq	a1, t4, a1	# E : sub dest misalignment from src addr
280	/* If source misalignment is larger than dest misalignment, we need
281	   extra startup checks to avoid SEGV.  */
282	cmplt	t4, t5, t12	# E :
283	beq	t12, $u_head	# U :
284	lda	t2, -1		# E : mask out leading garbage in source
285
	/* t3 is the first source word with the bytes before the string
	   start forced non-zero, so the test below sees only string bytes.  */
286	mskqh	t2, t5, t2	# U :
287	ornot	t1, t2, t3	# E : (stall)
288	cmpbge	zero, t3, t8	# E : is there a zero? (stall)
289	beq	t8, $u_head	# U : (stall)
290
291	/* At this point we've found a zero in the first partial word of
292	   the source.  We need to isolate the valid source data and mask
293	   it into the original destination data.  (Incidentally, we know
294	   that we'll need at least one byte of that original dest word.) */
295
	/* a1 was reduced by the destination misalignment above, so t5 is
	   reloaded here as the source-minus-destination byte offset.  The
	   srl by t5 moves the null's bit in t12 from its source-word byte
	   position to its destination-word byte position.  */
296	ldq_u	t0, 0(a0)	# L :
297	negq	t8, t6		# E : build bitmask of bytes <= zero
298	and	t6, t8, t12	# E : (stall)
299	and	a1, 7, t5	# E :
300
301	subq	t12, 1, t6	# E :
302	or	t6, t12, t8	# E : (stall)
303	srl	t12, t5, t12	# U : adjust final null return value
304	zapnot	t2, t8, t2	# U : prepare source word; mirror changes (stall)
305
306	and	t1, t2, t1	# E : to source validity mask
307	extql	t2, a1, t2	# U :
308	extql	t1, a1, t1	# U : (stall)
309	andnot	t0, t2, t0	# .. e1 : zero place for source to reside (stall)
310
311	or	t0, t1, t1	# e1    : and put it there
312	stq_u	t1, 0(a0)	# .. e0 : (stall)
313	ret	(t9)		# e1    :
314	nop
315
316	.end __stxcpy
317