1/*
2 * Copyright (c) 2023 The FreeBSD Foundation
3 *
4 * This software was developed by Robert Clausecker <fuz@FreeBSD.org>
5 * under sponsorship from the FreeBSD Foundation.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 *    notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 *    notice, this list of conditions and the following disclaimer in the
14 *    documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ''AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE
27 */
28
29#include <machine/asm.h>
30
31#include "amd64_archlevel.h"
32
/* align the following code to 16 bytes (2^4), padding with 0x90 (nop) bytes */
#define ALIGN_TEXT	.p2align 4, 0x90

	/* stpncpy is provided as a weak alias of __stpncpy */
	.weak stpncpy
	.set stpncpy, __stpncpy

/*
 * Variants of __stpncpy implemented in this file.  ARCHFUNCS, ARCHFUNC
 * and ENDARCHFUNCS are defined in amd64_archlevel.h; presumably they
 * build the table from which one variant is selected according to the
 * architecture level of the CPU -- confirm against that header.
 */
ARCHFUNCS(__stpncpy)
	ARCHFUNC(__stpncpy, scalar)
	ARCHFUNC(__stpncpy, baseline)
ENDARCHFUNCS(__stpncpy)
41
/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * Generic variant assembled from memchr(), memcpy() and memset():
 * look for the end of the string within the first len bytes, copy the
 * string part and zero whatever is left of the buffer.  Returns a
 * pointer to the first NUL byte written, or dest + len if the string
 * did not end within len bytes.
 *
 * Frame slots used while the helpers run:
 *	 -8(%rbp)	len, later the number of padding bytes
 *	-16(%rbp)	dest, later the address where the padding starts
 */
ARCHENTRY(__stpncpy, scalar)
	push	%rbp			# set up a frame pointer
	mov	%rsp, %rbp

	push	%rdx			# -8(%rbp): len
	push	%rdi			# -16(%rbp): dest
	push	%rsi			# src, needed again after memchr
	sub	$8, %rsp		# padding: keep the stack aligned for the call

	mov	%rsi, %rdi
	xor	%esi, %esi
	call	CNAME(__memchr)		# rax = memchr(src, '\0', len)
	add	$8, %rsp		# drop the padding
	pop	%rsi			# rsi = src
	mov	-16(%rbp), %rdi		# rdi = dest

	test	%rax, %rax		# did the string end within len bytes?
	jnz	.Lpadcopy

	/* no NUL byte within len bytes: copy all of them, nothing to clear */
	mov	-8(%rbp), %rdx
	call	CNAME(memcpy)		# rax = dest
	add	-8(%rbp), %rax		# return dest + len
	leave
	ret

	/* string is shorter than the buffer: copy it, then zero the rest */
.Lpadcopy:
	sub	%rsi, %rax		# rax = length of the string
	mov	%rax, %rdx
	add	%rax, -16(%rbp)		# dest + length: where the padding starts
	sub	%rax, -8(%rbp)		# len - length: size of the padding
	call	CNAME(memcpy)		# memcpy(dest, src, length)

	pop	%rdi			# start of padding
	pop	%rdx			# size of padding
	xor	%esi, %esi
	pop	%rbp			# frame is gone, return address on top
	jmp	CNAME(memset)		# tail call: memset returns its first
					# argument, i.e. the address of the NUL
ARCHEND(__stpncpy, scalar)
80
	/*
	 * Sliding mask table: 16 bytes of 0xff followed by 16 bytes of
	 * 0x00.  An unaligned 16 byte load from .Lmask+n (0 <= n <= 16)
	 * yields a mask of 16-n 0xff bytes followed by n 0x00 bytes.
	 * The baseline code loads from .Lmask+16-i to obtain a mask
	 * that keeps the first i bytes of a chunk and clears the rest.
	 */
	.section	.rodata
.Lmask:	.quad		0xffffffffffffffff
	.quad		0xffffffffffffffff
	.quad		0x0000000000000000
	.quad		0x0000000000000000
90
/*
 * stpncpy(char *restrict rdi, const char *rsi, size_t rdx)
 *
 * Baseline (SSE2) variant.  Copies the string at RSI into the RDX byte
 * buffer at RDI, clears the part of the buffer behind the string and
 * returns in RAX a pointer to the first NUL byte written, or to the
 * end of the buffer if the string did not end within RDX bytes.
 *
 * The source is inspected in aligned 16 byte chunks.  A chunk is only
 * loaded if at least one of its bytes belongs to the source array, so
 * no load can touch a page the source does not occupy; unaligned
 * loads stay within chunks known to contain source bytes.
 *
 * Register roles after the common setup:
 *	RSI	source, rounded down to a 16 byte boundary
 *	RCX	offset of the string within its first chunk (0--15)
 *	RDI	destination minus RCX, so RDI+x corresponds to RSI+x
 *	R10	buffer length counted from the alignment boundary
 *		(RDX+RCX); later a running count of bytes left
 *	R8	bit mask of NUL bytes in the current chunk(s)
 *	R9	mask of head chunk bytes at or behind the string start
 *	XMM1	all zeroes (recomputed where it gets clobbered)
 *
 * This is a leaf function: the bounce buffer lives below RSP and RSP
 * itself is never adjusted.
 */
ARCHENTRY(__stpncpy, baseline)
#define bounce		(-3*16-8)		/* location of on-stack bounce buffer */

	test		%rdx, %rdx		# no bytes to copy?
	jz		.L0

	mov		%esi, %ecx
	and		$~0xf, %rsi		# align source to 16 bytes
	movdqa		(%rsi), %xmm0		# load head
	and		$0xf, %ecx		# offset from alignment
	mov		$-1, %r9d
	lea		-33(%rcx), %rax		# set up overflow-proof comparison rdx+rcx<=32
	shl		%cl, %r9d		# mask of bytes belonging to the string
	sub		%rcx, %rdi		# adjust RDI to correspond to RSI
	pxor		%xmm1, %xmm1
	movdqa		%xmm0, bounce(%rsp)	# stash copy of head on the stack
	pcmpeqb		%xmm1, %xmm0
	pmovmskb	%xmm0, %r8d

	/*
	 * A buffer that ends within or right at the end of the second
	 * chunk (RDX+RCX <= 32) must be handled by .Lrunt: the tail
	 * code of the main loop always inspects the chunk following
	 * the current one and so needs at least one buffer byte in it.
	 */
	lea		(%rdx, %rcx, 1), %r10	# buffer length from alignment boundary
	add		%rdx, %rax		# at most 2 chunks (32 bytes) to play with?
	jnc		.Lrunt			# if yes, use special runt processing

	movdqu		%xmm1, -16(%rdi, %r10, 1) # clear final bytes of destination
	and		%r9d, %r8d		# end of string within head?
	jnz		.Lheadnul

	movdqu		(%rsi, %rcx, 1), %xmm2	# load head from source buffer
	movdqu		%xmm2, (%rdi, %rcx, 1)	# and deposit

	add		$16, %rsi
	add		$16, %rdi
	sub		$32, %r10		# R10: bytes left behind the current chunk (> 0)

	/* main loop unrolled twice */
	ALIGN_TEXT
0:	movdqa		(%rsi), %xmm0
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		3f

	movdqu		%xmm0, (%rdi)
	cmp		$16, %r10		# more than a full chunk left?
	jbe		1f

	movdqa		16(%rsi), %xmm0
	add		$32, %rdi		# advance pointers to next chunk
	add		$32, %rsi
	pxor		%xmm1, %xmm1
	pcmpeqb		%xmm0, %xmm1		# NUL byte encountered?
	pmovmskb	%xmm1, %r8d
	test		%r8d, %r8d
	jnz		2f

	movdqu		%xmm0, -16(%rdi)
	sub		$32, %r10		# more than another full chunk left?
	ja		0b

	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	add		$16, %r10d		# restore number of remaining bytes

	/*
	 * 1--16 bytes left but string has not ended yet.  RSI points
	 * to the last chunk copied, the remaining bytes are in the
	 * chunk behind it.  The final 16 bytes of the destination
	 * have been cleared on entry, so it suffices to copy the 16
	 * bytes ending right before the NUL byte or end of buffer.
	 */
1:	pxor		%xmm1, %xmm1
	pcmpeqb		16(%rsi), %xmm1		# NUL byte in source tail?
	pmovmskb	%xmm1, %r8d
	bts		%r10d, %r8d		# treat end of buffer as NUL
	tzcnt		%r8d, %r8d		# where is the NUL byte?
	movdqu		(%rsi, %r8, 1), %xmm0	# load source tail before NUL
	lea		16(%rdi, %r8, 1), %rax	# point return value to NUL byte
						# or end of buffer
	movdqu		%xmm0, (%rdi, %r8, 1)	# store tail into the buffer
	ret

2:	sub		$16, %rdi		# undo second advancement
	sub		$16, %rsi
	sub		$16, %r10

	/* string has ended and buffer has not */
3:	tzcnt		%r8d, %r8d		# where did the string end?
	lea		.Lmask+16(%rip), %rcx
	lea		(%rdi, %r8, 1), %rax 	# where the NUL byte will be
	neg		%r8
	movdqu		(%rcx, %r8, 1), %xmm1	# mask with FF where the string is,
						# 00 where it is not
	pand		%xmm1, %xmm0		# mask out bytes after the string
	movdqu		%xmm0, (%rdi)	 	# store masked current chunk
	pxor		%xmm1, %xmm1
	sub		$16, %r10		# another full chunk left?
	jbe		1f

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, 16(%rdi)
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 32(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/* more than two chunks to play with and NUL while processing head */
.Lheadnul:
	movdqu		bounce(%rsp, %rcx, 1), %xmm0 # load start of source from stack
	tzcnt		%r8d, %r8d		# find location of NUL byte
	movdqu		%xmm0, (%rdi, %rcx, 1)	# deposit head in the destination
	movdqu		%xmm1, (%rdi, %r8, 1)	# clear out following bytes
	movdqu		%xmm1, 16(%rdi)		# clear out second chunk
	lea		(%rdi, %r8, 1), %rax	# make RAX point to the NUL byte

	add		$32, %rdi		# advance past first two chunks
	sub		$32+16, %r10		# advance past first three chunks
	jbe		1f			# did we pass the end of the buffer?

	/* clear remaining destination buffer (tail has been cleared earlier) */
	ALIGN_TEXT
0:	movdqu		%xmm1, (%rdi)		# clear out buffer chunk
	cmp		$16, %r10
	jbe		1f

	movdqu		%xmm1, 16(%rdi)
	add		$32, %rdi
	sub		$32, %r10
	ja		0b

1:	ret

	/*
	 * 1--32 bytes from the alignment boundary to the end of the
	 * buffer: bounce through the stack.  The end of the buffer is
	 * entered into the NUL mask as bit R10; as R10 can be 32, the
	 * mask is processed as a 64 bit quantity from here on.
	 */
.Lrunt:	movdqa		%xmm1, bounce+16(%rsp)	# clear out rest of on-stack copy
	bts		%r10, %r8		# treat end of buffer as end of string
	and		%r9w, %r8w		# ignore NUL bytes in front of the string
	test		$0x1ffff, %r8d		# string or buffer ends within or
						# right behind the first chunk?
	jnz		0f			# if yes, do not inspect second buffer

	movdqa		16(%rsi), %xmm0		# load second chunk of input
	movdqa		%xmm0, bounce+16(%rsp)	# stash copy on stack
	pcmpeqb		%xmm1, %xmm0		# NUL in second chunk?
	pmovmskb	%xmm0, %r9d
	shl		$16, %r9d
	or		%r9, %r8		# merge found NUL bytes into NUL mask
						# (64 bit: keep end marker in bit 32)

	/* R8 is nonzero here: it holds at least the end of buffer marker */
0:	tzcnt		%r8, %r8		# location of NUL byte or end of buffer
	movdqu		%xmm1, bounce(%rsp, %r8, 1) # clear bytes behind string
	lea		bounce(%rsp, %rcx, 1), %rsi # start of string copy on stack
	lea		(%rdi, %r8, 1), %rax	# return pointer to NUL byte

	/*
	 * copy RDX bytes from the on-stack copy to the destination
	 * with two possibly overlapping moves, sized by RDX
	 */
	cmp		$16, %edx		# at least 16 bytes to transfer?
	jae		.L1631

	mov		(%rsi), %r8		# load string head
	cmp		$8, %edx		# at least 8 bytes to transfer?
	jae		.L0815

	cmp		$4, %edx		# at least 4 bytes to transfer?
	jae		.L0407

	movzwl		-2(%rsi, %rdx, 1), %esi	# load last two bytes of string
	mov		%r8b, (%rdi, %rcx, 1)	# store first byte

	cmp		$2, %edx		# at least 2 bytes to transfer?
	jb		.L1

	mov		%si, -2(%rdi, %r10, 1)	# store last two bytes of string
.L1:	ret

.L1631:	movdqu		(%rsi), %xmm0		# load first 16 bytes of string
	movdqu		-16(%rsi, %rdx, 1), %xmm1 # load last 16 bytes of string
	movdqu		%xmm0, (%rdi, %rcx, 1)
	movdqu		%xmm1, -16(%rdi, %r10, 1)
	ret

.L0815:	mov		-8(%rsi, %rdx, 1), %rdx	# load last 8 bytes of string
	mov		%r8, (%rdi, %rcx, 1)
	mov		%rdx, -8(%rdi, %r10, 1)
	ret

.L0407:	mov		-4(%rsi, %rdx, 1), %edx	# load last four bytes of string
	mov		%r8d, (%rdi, %rcx, 1)
	mov		%edx, -4(%rdi, %r10, 1)
	ret

	/* length 0 buffer: just return dest */
.L0:	mov		%rdi, %rax
	ret
ARCHEND(__stpncpy, baseline)
282
	/* empty marker section: this object does not need an executable stack */
	.section .note.GNU-stack,"",%progbits
284