1/*
2 * Written by J.T. Conklin <jtc@acorntoolworks.com>
3 * Public domain.
4 */
5
6#include <machine/asm.h>
7
/*
 * Revision identification string for this file, included only when
 * LIBC_SCCS is defined.  RCSID is not defined in this file; presumably
 * it is provided by <machine/asm.h> -- confirm there.
 */
#if defined(LIBC_SCCS)
	RCSID("$NetBSD: strcat.S,v 1.2 2014/03/22 19:38:46 jakllsch Exp $")
#endif
11
/*
 * char *strcat(char *dst, const char *src)
 *
 * Appends the NUL-terminated string src (terminator included) to the end
 * of the NUL-terminated string dst and returns dst.
 *
 * Arguments are read from the stack: once %ebx has been pushed, dst is
 * at 8(%esp) and src is at 12(%esp).
 *
 * Register use:
 *	%ecx	cursor into dst (scan position, then write position)
 *	%eax	cursor into src; holds dst again on return
 *	%ebx	current data word / byte (saved and restored here)
 *	%edx	scratch for the zero-byte test
 *
 * Zero-byte test: subtracting 0x01010101 from a word and masking the
 * result with 0x80808080 yields non-zero whenever the word contains a
 * zero byte.  Bytes of 0x81 and above trip the test as well, so every
 * hit is confirmed byte by byte and the word loop is resumed when it
 * turns out to be a false alarm.
 *
 * Both word loops load whole aligned words, so up to three bytes beyond
 * a terminator may be read.  Nothing is written beyond the terminator
 * that ends up in dst.
 */
ENTRY(strcat)
	pushl	%ebx
	movl	12(%esp),%eax		/* %eax = src */
	movl	8(%esp),%ecx		/* %ecx = dst */

	/*
	 * Phase 1: find the terminator of dst.
	 * Walk single bytes until %ecx is a multiple of four.
	 */
.Lfind_head:
	testb	$3,%cl
	je	.Lfind_words
	cmpb	$0,(%ecx)
	je	.Lappend		/* terminator found before alignment */
	incl	%ecx
	jmp	.Lfind_head

	/*
	 * Examine dst one aligned word at a time.  %ecx is advanced
	 * before the test, so it points just past the word in %ebx.
	 */
	_ALIGN_TEXT
.Lfind_words:
	movl	(%ecx),%ebx
	addl	$4,%ecx
	leal	-0x01010101(%ebx),%edx
	testl	$0x80808080,%edx
	je	.Lfind_words

	/*
	 * The word may contain a zero byte.  Move %ecx back to the start
	 * of the word and step over each byte that is not zero; branch
	 * out with %ecx addressing the terminator.  If all four bytes
	 * are non-zero, %ecx is past the word again and scanning resumes.
	 */
	subl	$4,%ecx
	testb	%bl,%bl			/* 1st byte == 0? */
	je	.Lappend
	incl	%ecx
	testb	%bh,%bh			/* 2nd byte == 0? */
	je	.Lappend
	incl	%ecx
	shrl	$16,%ebx		/* bring upper half into %bl/%bh */
	testb	%bl,%bl			/* 3rd byte == 0? */
	je	.Lappend
	incl	%ecx
	testb	%bh,%bh			/* 4th byte == 0? */
	je	.Lappend
	incl	%ecx
	jmp	.Lfind_words		/* false alarm */

	/*
	 * Phase 2: %ecx addresses the terminator of dst.
	 * Copy single bytes until the source cursor is a multiple of
	 * four; finish at once if the terminator of src gets copied.
	 */
.Lappend:
	testb	$3,%al
	je	.Lcopy_load
	movb	(%eax),%bl
	movb	%bl,(%ecx)
	incl	%eax
	incl	%ecx
	testb	%bl,%bl
	jne	.Lappend
	jmp	.Lfinish

	/*
	 * Copy one word per iteration.  The loop is entered at
	 * .Lcopy_load; a word is stored only after the test has shown
	 * that it holds no zero byte.  Loads from src are aligned, the
	 * stores to dst need not be.
	 */
	_ALIGN_TEXT
.Lcopy_store:
	movl	%ebx,(%ecx)
	addl	$4,%ecx
.Lcopy_load:
	movl	(%eax),%ebx
	addl	$4,%eax
	leal	-0x01010101(%ebx),%edx
	testl	$0x80808080,%edx
	je	.Lcopy_store

	/*
	 * The word in %ebx may contain the terminator.  Store it one
	 * byte at a time and stop after the first zero byte.  If none
	 * of the four bytes is zero, the whole word has been written
	 * and the word loop continues with the next load.
	 */
	movb	%bl,(%ecx)		/* 1st byte */
	incl	%ecx
	testb	%bl,%bl
	je	.Lfinish

	movb	%bh,(%ecx)		/* 2nd byte */
	incl	%ecx
	testb	%bh,%bh
	je	.Lfinish

	shrl	$16,%ebx		/* bring upper half into %bl/%bh */
	movb	%bl,(%ecx)		/* 3rd byte */
	incl	%ecx
	testb	%bl,%bl
	je	.Lfinish

	movb	%bh,(%ecx)		/* 4th byte */
	incl	%ecx
	testb	%bh,%bh
	jne	.Lcopy_load		/* false alarm */

.Lfinish:
	popl	%ebx
	movl	4(%esp),%eax		/* return dst */
	ret
END(strcat)
129