/*- * Copyright (c) 2023, The FreeBSD Foundation * * SPDX-License-Expression: BSD-2-Clause * * Portions of this software were developed by Robert Clausecker * under sponsorship from the FreeBSD Foundation. * * Adapted from NetBSD's common/lib/libc/arch/x86_64/string/strcmp.S * written by J.T. Conklin that was originally * dedicated to the public domain. */ #include #include #if 0 RCSID("$NetBSD: strcmp.S,v 1.3 2004/07/19 20:04:41 drochner Exp $") #endif #include "amd64_archlevel.h" #define ALIGN_TEXT .p2align 4, 0x90 ARCHFUNCS(strcmp) ARCHFUNC(strcmp, scalar) ARCHFUNC(strcmp, baseline) ENDARCHFUNCS(strcmp) ARCHENTRY(strcmp, scalar) /* * Align s1 to word boundary. * Consider unrolling loop? */ .Ls1align: testb $7,%dil je .Ls1aligned movb (%rdi),%al incq %rdi movb (%rsi),%dl incq %rsi testb %al,%al je .Ldone cmpb %al,%dl je .Ls1align jmp .Ldone /* * Check whether s2 is aligned to a word boundary. If it is, we * can compare by words. Otherwise we have to compare by bytes. */ .Ls1aligned: testb $7,%sil jne .Lbyte_loop movabsq $0x0101010101010101,%r8 subq $8,%rdi movabsq $0x8080808080808080,%r9 subq $8,%rsi ALIGN_TEXT .Lword_loop: movq 8(%rdi),%rax addq $8,%rdi movq 8(%rsi),%rdx addq $8,%rsi cmpq %rax,%rdx jne .Lbyte_loop subq %r8,%rdx notq %rax andq %rax,%rdx testq %r9,%rdx je .Lword_loop ALIGN_TEXT .Lbyte_loop: movb (%rdi),%al incq %rdi movb (%rsi),%dl incq %rsi testb %al,%al je .Ldone cmpb %al,%dl je .Lbyte_loop .Ldone: movzbq %al,%rax movzbq %dl,%rdx subq %rdx,%rax ret ARCHEND(strcmp, scalar) ARCHENTRY(strcmp, baseline) /* check if either string crosses a page in the head */ lea 15(%rdi), %r8d # end of head lea 15(%rsi), %r9d mov %edi, %eax mov %esi, %edx xor %edi, %r8d # bits that changed between first and last byte xor %esi, %r9d and $~0xf, %rdi # align heads to 16 bytes and $~0xf, %rsi or %r8d, %r9d # in either RSI or RDI and $0xf, %eax # offset from alignment and $0xf, %edx pxor %xmm1, %xmm1 test $PAGE_SIZE, %r9d # did the page change? jz 0f # if not, take fast path /* heads may cross page boundary, avoid unmapped loads */ movdqa (%rdi), %xmm0 # load aligned heads movdqa (%rsi), %xmm2 mov $-1, %r8d mov $-1, %r9d mov %eax, %ecx shl %cl, %r8d # string head in XMM0 mov %edx, %ecx shl %cl, %r9d # string head in XMM2 movdqa %xmm0, -40(%rsp) # stash copies of the heads on the stack movdqa %xmm2, -24(%rsp) pcmpeqb %xmm1, %xmm0 pcmpeqb %xmm1, %xmm2 pmovmskb %xmm0, %r10d pmovmskb %xmm2, %r11d test %r8d, %r10d # NUL byte present in first string? lea -40(%rsp), %r8 cmovz %rdi, %r8 test %r9d, %r11d # NUL byte present in second string? lea -24(%rsp), %r9 cmovz %rsi, %r9 movdqu (%r8, %rax, 1), %xmm0 # load true (or fake) heads movdqu (%r9, %rdx, 1), %xmm4 jmp 1f 0: movdqu (%rdi, %rax, 1), %xmm0 # load true heads movdqu (%rsi, %rdx, 1), %xmm4 1: pxor %xmm2, %xmm2 pcmpeqb %xmm0, %xmm2 # NUL byte present? pcmpeqb %xmm0, %xmm4 # which bytes match? pandn %xmm4, %xmm2 # match and not NUL byte? pmovmskb %xmm2, %r9d xor $0xffff, %r9d # mismatch or NUL byte? jnz .Lhead_mismatch /* load head and second chunk */ movdqa 16(%rdi), %xmm2 # load second chunks movdqa 16(%rsi), %xmm3 sub %rdx, %rax # is a&0xf >= b&0xf? jb .Lswapped # if not, proceed with swapped operands neg %rax movdqu 16(%rsi, %rax, 1), %xmm0 sub %rdi, %rsi # express RSI as distance from RDI lea (%rsi, %rax, 1), %rdx # point RDX to offset in second string neg %rax pcmpeqb %xmm3, %xmm1 # ... corresponding to RDI pcmpeqb %xmm2, %xmm0 pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d add $16, %rdi test %r8d, %r8d jnz .Lnul_found xor $0xffff, %r9d jnz .Lmismatch add $16, %rdi # advance aligned pointers /* * During the main loop, the layout of the two strings is something like: * * v ------1------ v ------2------ v * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... * * where v indicates the alignment boundaries and corresponding chunks * of the strings have the same letters. Chunk A has been checked in * the previous iteration. This iteration, we first check that string * RSI doesn't end within region 2, then we compare chunk B between the * two strings. As RSI is known not to hold a NUL byte in regsions 1 * and 2 at this point, this also ensures that RDI has not ended yet. */ ALIGN_TEXT 0: movdqu (%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? pxor %xmm1, %xmm1 pcmpeqb (%rdi, %rsi, 1), %xmm1 # end of string in RSI? pcmpeqb (%rdi), %xmm0 # where do the chunks match? pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d test %r8d, %r8d jnz .Lnul_found xor $0xffff, %r9d # any mismatches? jnz .Lmismatch /* main loop unrolled twice */ movdqu 16(%rdi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? pxor %xmm1, %xmm1 pcmpeqb 16(%rdi, %rsi, 1), %xmm1 # end of string in RSI? pcmpeqb 16(%rdi), %xmm0 # where do the chunks match? pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d add $32, %rdi test %r8d, %r8d jnz .Lnul_found2 xor $0xffff, %r9d # any mismatches? jz 0b sub $16, %rdi # roll back second increment /* a mismatch has been found between RDX and RSI */ .Lmismatch: tzcnt %r9d, %r9d # where is the mismatch? add %rdi, %rdx # turn RDX from offset to pointer movzbl (%rdx, %r9, 1), %ecx movzbl (%rdi, %r9, 1), %eax sub %ecx, %eax # difference of the mismatching chars ret /* mismatch in true heads */ .Lhead_mismatch: tzcnt %r9d, %r9d # where is the mismatch? add %rax, %rdi # return to true heads add %rdx, %rsi movzbl (%rdi, %r9, 1), %eax # mismatching characters movzbl (%rsi, %r9, 1), %ecx sub %ecx, %eax ret .Lnul_found2: sub $16, %rdi # roll back second increment /* a NUL has been found in RSI */ .Lnul_found: mov %eax, %ecx mov %r8d, %r10d shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX xor $0xffff, %r9d # mask of mismatches or %r8d, %r9d # NUL bytes also count as mismatches jnz .Lmismatch /* * (RDI) == (RSI) and NUL is past the string. * Compare (RSI) with the corresponding part * of the other string until the NUL byte. */ movdqu (%rdi, %rax, 1), %xmm0 pcmpeqb (%rdi, %rsi, 1), %xmm0 add %rdi, %rsi # restore RSI pointer add %rax, %rdi # point RDI to chunk corresponding to (RSI) pmovmskb %xmm0, %ecx # mask of matches not %ecx # mask of mismatches or %r10d, %ecx # mask of mismatches or NUL bytes tzcnt %ecx, %ecx # location of first mismatch movzbl (%rdi, %rcx, 1), %eax movzbl (%rsi, %rcx, 1), %ecx sub %ecx, %eax ret /* * If (a&0xf) < (b&0xf), we do the same thing but with swapped * operands. I found that this performs slightly better than * using conditional moves to do the swap branchless. */ .Lswapped: movdqu 16(%rdi, %rax, 1), %xmm0 sub %rsi, %rdi # express RDI as distance from RSI lea (%rdi, %rax, 1), %rdx # point RDX to offset in RDI corresponding to RSI neg %rax # make difference positive pcmpeqb %xmm2, %xmm1 pcmpeqb %xmm3, %xmm0 pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d add $16, %rsi # advance aligned pointers test %r8d, %r8d jnz .Lnul_founds xor $0xffff, %r9d jnz .Lmismatchs add $16, %rsi /* * During the main loop, the layout of the two strings is something like: * * v ------1------ v ------2------ v * RDI: AAAAAAAAAAAAABBBBBBBBBBBBBBBB... * RSI: AAAAAAAAAAAAABBBBBBBBBBBBBBBBCCC... * * where v indicates the alignment boundaries and corresponding chunks * of the strings have the same letters. Chunk A has been checked in * the previous iteration. This iteration, we first check that string * RSI doesn't end within region 2, then we compare chunk B between the * two strings. As RSI is known not to hold a NUL byte in regsions 1 * and 2 at this point, this also ensures that RDI has not ended yet. */ ALIGN_TEXT 0: movdqu (%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? pxor %xmm1, %xmm1 pcmpeqb (%rsi, %rdi, 1), %xmm1 # end of string in RSI? pcmpeqb (%rsi), %xmm0 # where do the chunks match? pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d test %r8d, %r8d jnz .Lnul_founds xor $0xffff, %r9d # any mismatches? jnz .Lmismatchs /* main loop unrolled twice */ movdqu 16(%rsi, %rdx, 1), %xmm0 # chunk of 2nd string corresponding to RDI? pxor %xmm1, %xmm1 pcmpeqb 16(%rsi, %rdi, 1), %xmm1 # end of string in RSI? pcmpeqb 16(%rsi), %xmm0 # where do the chunks match? pmovmskb %xmm1, %r8d pmovmskb %xmm0, %r9d add $32, %rsi test %r8d, %r8d jnz .Lnul_found2s xor $0xffff, %r9d # any mismatches? jz 0b sub $16, %rsi # roll back second increment /* a mismatch has been found between RDX and RDI */ .Lmismatchs: tzcnt %r9d, %r9d # where is the mismatch? add %rsi, %rdx # turn RDX from offset to pointer movzbl (%rdx, %r9, 1), %eax movzbl (%rsi, %r9, 1), %ecx sub %ecx, %eax # difference of the mismatching chars ret .Lnul_found2s: sub $16, %rsi # roll back second increment /* a NUL has been found in RSI */ .Lnul_founds: mov %eax, %ecx mov %r8d, %r10d shl %cl, %r8w # adjust NUL mask to positions in RDI/RDX xor $0xffff, %r9d # mask of mismatches or %r8d, %r9d # NUL bytes also count as mismatches jnz .Lmismatchs /* * (RDI) == (RSI) and NUL is past the string. * Compare (RSI) with the corresponding part * of the other string until the NUL byte. */ movdqu (%rsi, %rax, 1), %xmm0 pcmpeqb (%rsi, %rdi, 1), %xmm0 add %rsi, %rdi # restore RDI pointer add %rax, %rsi # point RSI to chunk corresponding to (RDI) pmovmskb %xmm0, %ecx # mask of matches not %ecx # mask of mismatches or %r10d, %ecx # mask of mismatches or NUL bytes tzcnt %ecx, %ecx # location of first mismatch movzbl (%rdi, %rcx, 1), %eax movzbl (%rsi, %rcx, 1), %ecx sub %ecx, %eax ret ARCHEND(strcmp, baseline) .section .note.GNU-stack,"",%progbits