/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, Intel Corporation
 * All rights reserved.
 */

/*
 *	str[n]cpy - copy [n] chars from second operand into first operand
 */
#include "SYS.h"
#include "proc64_id.h"

#define LABEL(s) .strcpy/**/s

/*
 * char *strcpy(char *dst, const char *src)
 * char *strncpy(char *dst, const char *src, size_t n)	(USE_AS_STRNCPY)
 *
 * In:	%rdi = dst, %rsi = src, %rdx = n (strncpy only)
 * Out:	%rax = dst (saved below; the return paths are in the exit code)
 *
 * Register roles established by the set-up code:
 *	%r8	strncpy only: count still to be satisfied
 *	%r9	offset of src within its 16-byte block
 *	%r10	16 - src offset (src bytes left in the current dqword)
 *	%rcx	alignment offset, later the loop index
 *	%xmm0	all zeroes, compared against src to find the null byte
 *
 * The source is always read with 16-byte aligned loads. After the first
 * (unaligned) 16 bytes have been stored, dst is aligned as well and control
 * moves to the ashr_N loop that matches the relative src/dst alignment.
 */
#ifdef USE_AS_STRNCPY
	ENTRY(strncpy)
	/*
	 * The count is a 64-bit value: test all of %rdx. Testing only %edx
	 * would treat any non-zero multiple of 2^32 as a zero count.
	 */
	test	%rdx, %rdx
	jz	LABEL(strncpy_exitz)
	mov	%rdx, %r8
#else
	ENTRY(strcpy)				/* (char *, const char *) */
	xor	%rdx, %rdx
#endif
	mov	%esi, %ecx
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */
	and	$0xf, %rcx
	mov	%rdi, %rax			/* save destination address for return value */

	pxor	%xmm0, %xmm0			/* clear %xmm0 for null char checks */
	pcmpeqb	(%rsi), %xmm0			/* check 16 bytes in src for null */
	pmovmskb %xmm0, %edx
	shr	%cl, %edx			/* adjust for offset from 16byte boundary */
	test	%edx, %edx			/* edx will be 0 if chars are non-null */
	jnz	LABEL(less16bytes)		/* null char found in first 16 bytes examined */
#ifdef USE_AS_STRNCPY
	/*
	 * Check if the count is satisfied in first 16 bytes examined.
	 *
	 * NOTE(review): this is a signed comparison, so a count of 2^63 or
	 * more also takes this branch; confirm less16bytes copes with that.
	 */
	lea	-16(%r8, %rcx), %r11
	cmp	$0, %r11
	jle	LABEL(less16bytes)
#endif
	mov	%rcx, %r9			/* rsi alignment offset */
	or	%edi, %ecx
	and	$0xf, %ecx
	lea	-16(%r9), %r10			/* lea leaves the flags from "and" intact */
	jz	LABEL(ashr_0)			/* src and dest are both 16 byte aligned */

	neg	%r10				/* max src bytes remaining in current dqword */

	pxor	%xmm0, %xmm0			/* clear %xmm0, may be polluted by unaligned operation */
	pcmpeqb	16(%rsi), %xmm0			/* check next 16 bytes in src for a null */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(less32bytes)		/* null char found in first 32 bytes examined */

#ifdef USE_AS_STRNCPY
	/*
	 * If strncpy count <= 16 go to exit case
	 */
	sub	$16, %r8
	jbe	LABEL(less32bytes_strncpy_truncation)
#endif
	/*
	 * At least 16 bytes to copy to destination string. Move them now.
	 * Don't worry about alignment.
	 */
	mov	(%rsi, %r9), %rdx
	mov	%rdx, (%rdi)
	mov	8(%rsi, %r9), %rdx
	mov	%rdx, 8(%rdi)

	/*
	 * so far destination rdi may be aligned by 16, re-calculate rsi and
	 * jump to corresponding src/dest relative offset case.
	 *	rcx is offset of rsi
	 *	rdx is offset of rdi
	 */
	and	$0xfffffffffffffff0, %rdi	/* force rdi 16 byte align */
	mov	%rax, %rdx			/* rax contains original rdi */
	xor	%rdi, %rdx			/* same effect as "and $0xf, %rdx" */
#ifdef USE_AS_STRNCPY
	/*
	 * Will now do 16 byte aligned stores. Stores may overlap some bytes
	 * (ie store twice) if destination was unaligned. Compensate here.
	 */
	add	%rdx, %r8			/* compensate for overlap */
#endif

	add	$16, %rdi			/* next 16 bytes for dest */

	/*
	 * align src to 16-byte boundary. Could be up or down depending on
	 * whether src offset - dest offset > 0 (up) or
	 *  src offset - dest offset < 0 (down).
	 */
	sub	%rdx, %r9			/* src offset - dest offset */
	lea	16(%r9, %rsi), %rsi
	mov	%esi, %ecx			/* for new src offset */
	and	$0xfffffffffffffff0, %rsi	/* force rsi 16 byte align */

	and	$0xf, %ecx			/* new src offset is 0 if rsi/rdi have same alignment */
	jz	LABEL(ashr_0)

#ifdef USE_AS_STRNCPY
	xor	%edx, %edx			/* In case unaligned_exit is taken */
#endif
	/*
	 * Jump to case corresponding to source/dest string relative offsets
	 *	Index = (16 + (src offset - dest offset)) % 16
	 *
	 * unaligned_table holds 32-bit entries that are added to the table's
	 * own address to form the target.
	 */
	lea	-16(%rcx), %r10
	mov	%rcx, %r9
	neg	%r10				/* max src bytes remaining in current dqword */
	lea	LABEL(unaligned_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

/*
 * ashr_0 handles the following cases:
 *	src alignment offset = dest alignment offset
 *
 * Both paths into here arrive with %rcx = 0 and with %xmm0 all zeroes
 * (the preceding null check found nothing), so %rcx serves as the loop
 * index and %xmm0 can be reused for every null check.
 */
	.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi), %xmm1		/* fetch 16 bytes from src string */
	movdqa	%xmm1, (%rdi)		/* store 16 bytes into dest string */
	add	$16, %rsi
	add	$16, %rdi
	pcmpeqb	(%rsi), %xmm0		/* check 16 bytes in src for a null */
	pmovmskb %xmm0, %edx

	test	%edx, %edx		/* edx will be 0 if chars are non-null */
	jnz	LABEL(aligned_16bytes)	/* exit tail */

	/*
	 * Main aligned loop, unrolled four times: copy the dqword already
	 * known to be free of nulls, then check the following one.
	 */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(aligned_exit)

#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_aligned)
#endif
/* Fourth unrolled step of ashr_0_loop: copy one dqword, check the next. */
	movdqa	(%rsi, %rcx), %xmm1
	movdqa	%xmm1, (%rdi, %rcx)
	add	$16, %rcx
	pcmpeqb	(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jz	LABEL(ashr_0_loop)		/* no null yet: keep looping */
	jmp	LABEL(aligned_exit)

/*
 * ashr_15 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 15
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * %rsi and %rdi are both 16-byte aligned here and %rcx indexes both. Each
 * stored dqword is the top byte of the dqword at (%rsi, %rcx) followed by
 * the low 15 bytes of the dqword at 16(%rsi, %rcx). %xmm0 is expected to be
 * all zeroes on entry; a compare that finds no null byte leaves it zero.
 * For strncpy, %r8 is the count left and %r10 the number of src bytes left
 * in the current dqword (both set up before the jump-table dispatch).
 */
	.p2align 4
LABEL(ashr_15):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_15_use_sse2)

	/* SSSE3 loop, unrolled twice; palignr is emitted as raw bytes. */
	.p2align 4
LABEL(ashr_15_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3		/* next aligned src dqword */
	pcmpeqb	%xmm3, %xmm0			/* does it hold a null byte? */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $15, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $15, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0f

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_ssse3)

	/* SSE2 loop, unrolled twice: shift both dqwords and OR them together. */
	.p2align 4
LABEL(ashr_15_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2			/* top byte of current dqword -> byte 0 */
	pslldq	$1, %xmm3			/* low 15 bytes of next -> bytes 1..15 */
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$15, %xmm2
	pslldq	$1, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_15_use_sse2)

/*
 * ashr_14 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 14
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 *
 * Each stored dqword is the top 2 bytes of the dqword at (%rsi, %rcx)
 * followed by the low 14 bytes of the dqword at 16(%rsi, %rcx).
 */
	.p2align 4
LABEL(ashr_14):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_14_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_14_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $14, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $14, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0e

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_ssse3)

	/* SSE2 loop, first of its two unrolled steps. */
	.p2align 4
LABEL(ashr_14_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2
	pslldq	$2, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
/*
 * Second unrolled step of the ashr_14 SSE2 loop.
 *
 * All ashr_N sections below follow one scheme. %rsi and %rdi are 16-byte
 * aligned and %rcx indexes both. The next src dqword is checked for a null
 * byte against %xmm0 (expected to be all zeroes; a compare that finds no
 * null leaves it zero). Then one dst dqword is assembled from the top
 * (16 - N) bytes of the dqword at (%rsi, %rcx) and the low N bytes of the
 * dqword at 16(%rsi, %rcx): with palignr $N (emitted as raw bytes) on the
 * SSSE3 path, or with psrldq $N / pslldq $(16 - N) / por on the SSE2 path.
 * For strncpy, %r8 is the count left and %r10 the number of src bytes left
 * in the current dqword (both set up before the jump-table dispatch).
 */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$14, %xmm2
	pslldq	$2, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_14_use_sse2)

/*
 * ashr_13 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 13
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_13):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_13_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_13_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $13, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $13, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0d

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_13_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2
	pslldq	$3, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$13, %xmm2
	pslldq	$3, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_13_use_sse2)

/*
 * ashr_12 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 12
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_12):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_12_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_12_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $12, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0c

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_12_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2
	pslldq	$4, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$12, %xmm2
	pslldq	$4, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_12_use_sse2)

/*
 * ashr_11 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 11
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_11):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_11_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_11_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $11, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0b

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_11_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2
	pslldq	$5, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$11, %xmm2
	pslldq	$5, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_11_use_sse2)

/*
 * ashr_10 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 10
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_10):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_10_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_10_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $10, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x0a

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_10_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2
	pslldq	$6, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$10, %xmm2
	pslldq	$6, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_10_use_sse2)

/*
 * ashr_9 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 9
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_9):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_9_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_9_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $9, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x09

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_9_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2
	pslldq	$7, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$9, %xmm2
	pslldq	$7, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_9_use_sse2)

/*
 * ashr_8 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 8
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_8):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_8_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_8_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $8, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x08

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_8_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2
	pslldq	$8, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$8, %xmm2
	pslldq	$8, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_8_use_sse2)

/*
 * ashr_7 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 7
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_7):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_7_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_7_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $7, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x07

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_7_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$7, %xmm2
	pslldq	$9, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$7, %xmm2
	pslldq	$9, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_7_use_sse2)

/*
 * ashr_6 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 6
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_6):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_6_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_6_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $6, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x06

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $6, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x06

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_6_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_6_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$6, %xmm2
	pslldq	$10, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$6, %xmm2
	pslldq	$10, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_6_use_sse2)

/*
 * ashr_5 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 5
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_5):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_5_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_5_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $5, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x05

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $5, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x05

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_5_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_5_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$5, %xmm2
	pslldq	$11, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$5, %xmm2
	pslldq	$11, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_5_use_sse2)

/*
 * ashr_4 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 4
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_4):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_4_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_4_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $4, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x04

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_4_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$4, %xmm2
	pslldq	$12, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$4, %xmm2
	pslldq	$12, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_4_use_sse2)

/*
 * ashr_3 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 3
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_3):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_3_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_3_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $3, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x03

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_3_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$3, %xmm2
	pslldq	$13, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$3, %xmm2
	pslldq	$13, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_3_use_sse2)

/*
 * ashr_2 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 2
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
 */
	.p2align 4
LABEL(ashr_2):
	xor	%ecx, %ecx				/* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	testl	$USE_SSSE3, .memops_method(%rip)	/* use sse2 or ssse3? */
	jz	LABEL(ashr_2_use_sse2)

	/* SSSE3 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_2_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	#palignr $2, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x02

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_ssse3)

	/* SSE2 loop, unrolled twice. */
	.p2align 4
LABEL(ashr_2_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$2, %xmm2
	pslldq	$14, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$2, %xmm2
	pslldq	$14, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_2_use_sse2)

/*
 * ashr_1 handles the following cases:
 *	(16 + (src offset - dest offset)) % 16 = 1
 *
 * Based on above operation, start from (%r9 + rsi) to the left of this cache
 * bank, there is no null byte.
*/
/*
 * Registers as used by the ashr_1 code below:
 *	%rsi + %rcx	source dqwords, read with movdqa (16-byte aligned access)
 *	%rdi + %rcx	destination dqwords, written with movdqa
 *	%rcx		running byte index; cleared here, +16 per dqword stored
 *	%xmm0		null-compare scratch; presumably all-zero on entry
 *			(TODO confirm against the dispatch code)
 *	%edx		byte mask of the compare on the dqword at 16(%rsi, %rcx)
 *	%r8, %r10	strncpy only: %r8 is compared with %r10 before every
 *			iteration and reduced by 16 for every dqword stored
 */
	.p2align 4
LABEL(ashr_1):
	xor	%ecx, %ecx /* clear index */
#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)	/* %r8 <= %r10 (unsigned): finish in the exit tail */
#endif
	/* Pick the loop flavour from the USE_SSSE3 bit of .memops_method. */
	testl	$USE_SSSE3, .memops_method(%rip) /* use sse2 or ssse3? */
	jz	LABEL(ashr_1_use_sse2)

	/*
	 * SSSE3 loop, unrolled twice.  Each iteration checks the next source
	 * dqword for a null and, if none is found, stores 16 destination bytes.
	 */
	.p2align 4
LABEL(ashr_1_use_ssse3):
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0		/* 0xff in every byte lane where %xmm3 == %xmm0 */
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
	/*
	 * Falling through means the compare result left in %xmm0 is all-zero,
	 * so %xmm0 is zero again for the next compare.
	 */
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)	/* %r8 was <= 16 */
#endif

	/*
	 * The two .byte lines hand-encode the instruction shown in the '#'
	 * line: %xmm3 = bytes 1..16 of the 32-byte pair
	 * { (%rsi, %rcx), 16(%rsi, %rcx) }.
	 */
#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled iteration, same steps as the first. */
	movdqa	16(%rsi, %rcx), %xmm3
	pcmpeqb	%xmm3, %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

#palignr $1, (%rsi, %rcx), %xmm3
	.byte	0x66, 0x0F, 0x3A ,0x0F
	.byte	0x1c, 0x0e, 0x01

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_ssse3)

	/*
	 * SSE2 loop, unrolled twice.  Same job as the SSSE3 loop; the shifted
	 * dqword is built from two byte shifts and an OR instead of palignr.
	 */
	.p2align 4
LABEL(ashr_1_use_sse2):
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	/* (low dqword >> 1 byte) | (high dqword << 15 bytes) */
	psrldq	$1, %xmm2
	pslldq	$15, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	/* Second unrolled iteration, same steps as the first. */
	pcmpeqb	16(%rsi, %rcx), %xmm0
	pmovmskb %xmm0, %edx
	test	%edx, %edx
	jnz	LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	LABEL(strncpy_truncation_unaligned)
#endif

	movdqa	16(%rsi, %rcx), %xmm3
	movdqa	(%rsi, %rcx), %xmm2

	psrldq	$1, %xmm2
	pslldq	$15, %xmm3
	por	%xmm2, %xmm3

	movdqa	%xmm3, (%rdi, %rcx)
	add	$16, %rcx

#ifdef USE_AS_STRNCPY
	cmp	%r10, %r8
	jbe	LABEL(unaligned_exit)
#endif
	jmp	LABEL(ashr_1_use_sse2)

/*
 * Exit tail code:
 * Up to 32 bytes are copied in the case of strcpy.
*/
/*
 * The labels below form one fall-through chain; later entry points skip
 * the adjustments that earlier ones perform:
 *	less32bytes	clears the index in %ecx, then continues
 *	unaligned_exit	adds %r9 to %rsi and shifts the mask in %edx left by %r10
 *	aligned_exit	adds the index in %rcx to %rdi
 *	less16bytes	adds %rcx to %rsi
 *	aligned_16bytes	dispatches on the mask in %edx
 * The lowest set bit of %edx then selects how many bytes the tail copies.
 */
	.p2align 4
LABEL(less32bytes):
	xor	%ecx, %ecx
LABEL(unaligned_exit):
	add	%r9, %rsi /* r9 holds offset of rsi */
	/*
	 * A variable shift count must be in %cl, so the index in %rcx is
	 * parked in %r9 while %r10 supplies the count.
	 */
	mov	%rcx, %r9
	mov	%r10, %rcx
	shl	%cl, %edx /* after shl, calculate the exact number to be filled */
	mov	%r9, %rcx

	.p2align 4
LABEL(aligned_exit):
	add	%rcx, %rdi /* locate exact address for rdi */
LABEL(less16bytes):
	add	%rcx, %rsi /* locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
	/*
	 * Null found in 16bytes checked. Set bit in bitmask corresponding to
	 * the strncpy count argument. We will copy to the null (inclusive)
	 * or count whichever comes first.
	 */
	/*
	 * %r9d = 1 << (%r8 - 1).  The bit is merged into the mask only when
	 * %r8 <= 32 (unsigned); for larger counts the ja skips the or.
	 */
	mov	$1, %r9d
	lea	-1(%r8), %rcx
	shl	%cl, %r9d
	cmp	$32, %r8
	ja	LABEL(strncpy_tail)
	or	%r9d, %edx
LABEL(strncpy_tail):
#endif
	/*
	 * Check to see if BSF is fast on this processor. If not, use a
	 * different exit tail.
	 */
	testb	$USE_BSF, .memops_method(%rip)
	jz	LABEL(AMD_exit)
	bsf	%rdx, %rcx /* Find byte with null char */
	/*
	 * tail_table holds 32-bit offsets relative to the table itself:
	 * load entry %rcx sign-extended, add the table address, and jump.
	 */
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

#ifdef USE_AS_STRNCPY
/*
 * Count reached before null found.
*/
/*
 * Three entry points that fall through into each other:
 *	less32bytes_strncpy_truncation	clears the index in %ecx first
 *	strncpy_truncation_unaligned	adds %r9 to %rsi
 *	strncpy_truncation_aligned	adds the index in %rcx to %rdi and %rsi
 * The remaining count in %r8 then picks the tail stub directly; no null
 * mask is consulted on this path.
 */
	.p2align 4
LABEL(less32bytes_strncpy_truncation):
	xor	%ecx, %ecx
LABEL(strncpy_truncation_unaligned):
	add	%r9, %rsi /* next src char to copy */
LABEL(strncpy_truncation_aligned):
	add	%rcx, %rdi
	add	%rcx, %rsi
	/* Adds back the 16 that the jumping code subtracted from %r8. */
	add	$16, %r8 /* compensation */
	/* tail_table entry k copies k + 1 bytes, hence index %r8 - 1. */
	lea	-1(%r8), %rcx
	lea	LABEL(tail_table)(%rip), %r11
	movslq	(%r11, %rcx, 4), %rcx
	lea	(%r11, %rcx), %rcx
	jmp	*%rcx

	/* Returns the destination pointer in %rax without copying anything. */
	.p2align 4
LABEL(strncpy_exitz):
	mov	%rdi, %rax
	ret
#endif

	/*
	 * Exit dispatch without bsf: test the mask bits in %dl from bit 0
	 * upward and jump to the tail stub of the first set bit.  A zero %dl
	 * moves on to the next mask byte; if only bit 7 is set, execution
	 * falls through into tail_7.
	 */
	.p2align 4
LABEL(AMD_exit):
	test	%dl, %dl
	jz	LABEL(AMD_exit_more_8)
	test	$0x01, %dl
	jnz	LABEL(tail_0)
	test	$0x02, %dl
	jnz	LABEL(tail_1)
	test	$0x04, %dl
	jnz	LABEL(tail_2)
	test	$0x08, %dl
	jnz	LABEL(tail_3)
	test	$0x10, %dl
	jnz	LABEL(tail_4)
	test	$0x20, %dl
	jnz	LABEL(tail_5)
	test	$0x40, %dl
	jnz	LABEL(tail_6)

	.p2align 4
LABEL(tail_7): /* 8 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
#ifdef USE_AS_STRNCPY
	/* %cl = bytes just stored; a non-zero remainder in %r8 gets null-filled. */
	mov	$8, %cl
	sub	$8, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

#ifdef USE_AS_STRNCPY
/*
 * Null terminated src string shorter than count. Fill the rest of the
 * destination with null chars.
*/
/*
 * On entry to strncpy_fill_tail:
 *	%cl	number of bytes the tail stub just stored at (%rdi)
 *	%r8	number of null bytes still to be written (non-zero)
 *	%rax	value to return to the caller
 * %rax is parked in %rdx for the duration of the fill because %rax/%al
 * supplies the zero that stosq and the byte stores write.
 */
	.p2align 4
LABEL(strncpy_fill_tail):
	mov	%rax, %rdx
	movzx	%cl, %rax
	mov	%r8, %rcx
	add	%rax, %rdi		/* first byte after the copied data */
	xor	%eax, %eax
	/*
	 * Quadword count for rep stosq.  The shift uses the full 64-bit
	 * register: shifting only %ecx would discard bits 32 and up of the
	 * fill length and leave the destination under-padded when 4 GiB or
	 * more remain to be filled.
	 */
	shr	$3, %rcx
	jz	LABEL(strncpy_fill_less_8)

	rep	stosq
LABEL(strncpy_fill_less_8):
	mov	%r8, %rcx
	and	$7, %rcx		/* 0..7 bytes left after the quadwords */
	jz	LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
	sub	$1, %ecx
	mov	%al, (%rdi, %rcx)	/* mov keeps the flags set by sub */
	jnz	LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
	mov	%rdx, %rax
	ret
#endif

/*
 * Tail stubs.  tail_N copies N + 1 bytes from (%rsi) to (%rdi) using a few
 * fixed-size moves that may overlap each other, then returns.  In the
 * strncpy build each stub first loads the number of bytes it copied into
 * %cl and subtracts it from %r8; a non-zero remainder is handed to
 * strncpy_fill_tail.
 */
	.p2align 4
LABEL(tail_0): /* 1 byte */
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$1, %cl
	sub	$1, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_1): /* 2 bytes */
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$2, %cl
	sub	$2, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_2): /* 3 bytes */
	mov	(%rsi), %cx
	mov	%cx, (%rdi)
	mov	1(%rsi), %cx
	mov	%cx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$3, %cl
	sub	$3, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_3): /* 4 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
#ifdef USE_AS_STRNCPY
	mov	$4, %cl
	sub	$4, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_4): /* 5 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	1(%rsi), %edx
	mov	%edx, 1(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$5, %cl
	sub	$5, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_5): /* 6 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	2(%rsi), %edx
	mov	%edx, 2(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$6, %cl
	sub	$6, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_6): /* 7 bytes */
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	3(%rsi), %edx
	mov	%edx,3(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$7, %cl
	sub	$7, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_8): /* 9 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	5(%rsi), %edx
	mov	%edx, 5(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$9, %cl
	sub	$9, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	/*
	 * Mask bits 0..7 were clear: test bits 8..15 (in %dh) from the lowest
	 * upward.  If only bit 15 is set, execution falls through into
	 * tail_15.
	 */
	.p2align 4
LABEL(AMD_exit_more_8):
	test	%dh, %dh
	jz	LABEL(AMD_exit_more_16)
	test	$0x01, %dh
	jnz	LABEL(tail_8)
	test	$0x02, %dh
	jnz	LABEL(tail_9)
	test	$0x04, %dh
	jnz	LABEL(tail_10)
	test	$0x08, %dh
	jnz	LABEL(tail_11)
	test	$0x10, %dh
	jnz	LABEL(tail_12)
	test	$0x20, %dh
	jnz	LABEL(tail_13)
	test	$0x40, %dh
	jnz	LABEL(tail_14)

	.p2align 4
LABEL(tail_15): /* 16 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$16, %cl
	sub	$16, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_9): /* 10 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	6(%rsi), %edx
	mov	%edx, 6(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$10, %cl
	sub	$10, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_10): /* 11 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	7(%rsi), %edx
	mov	%edx, 7(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$11, %cl
	sub	$11, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_11): /* 12 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %edx
	mov	%edx, 8(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$12, %cl
	sub	$12, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_12): /* 13 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	5(%rsi), %rcx
	mov	%rcx, 5(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$13, %cl
	sub	$13, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_13): /* 14 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	6(%rsi), %rcx
	mov	%rcx, 6(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$14, %cl
	sub	$14, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_14): /* 15 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	7(%rsi), %rcx
	mov	%rcx, 7(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$15, %cl
	sub	$15, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	/*
	 * Mask bits 0..15 were clear: shift bits 16..31 down so that they can
	 * be tested through %dl and %dh again.  If only bit 7 of %dl is set,
	 * execution falls through into tail_23.
	 */
	.p2align 4
LABEL(AMD_exit_more_16):
	shr	$16, %edx
	test	%dl, %dl
	jz	LABEL(AMD_exit_more_24)
	test	$0x01, %dl
	jnz	LABEL(tail_16)
	test	$0x02, %dl
	jnz	LABEL(tail_17)
	test	$0x04, %dl
	jnz	LABEL(tail_18)
	test	$0x08, %dl
	jnz	LABEL(tail_19)
	test	$0x10, %dl
	jnz	LABEL(tail_20)
	test	$0x20, %dl
	jnz	LABEL(tail_21)
	test	$0x40, %dl
	jnz	LABEL(tail_22)

	.p2align 4
LABEL(tail_23): /* 24 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$24, %cl
	sub	$24, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_16): /* 17 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %cl
	mov	%cl, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$17, %cl
	sub	$17, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_17): /* 18 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %cx
	mov	%cx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$18, %cl
	sub	$18, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_18): /* 19 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	15(%rsi), %ecx
	mov	%ecx,15(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$19, %cl
	sub	$19, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_19): /* 20 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %ecx
	mov	%ecx, 16(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$20, %cl
	sub	$20, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_20): /* 21 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	13(%rsi), %rcx
	mov	%rcx, 13(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$21, %cl
	sub	$21, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_21): /* 22 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	14(%rsi), %rcx
	mov	%rcx, 14(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$22, %cl
	sub	$22, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_22): /* 23 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	15(%rsi), %rcx
	mov	%rcx, 15(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$23, %cl
	sub	$23, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	/*
	 * Last mask byte (now in %dh).  No zero check is made here: if none
	 * of bits 0..6 is set, execution falls through into tail_31.
	 */
	.p2align 4
LABEL(AMD_exit_more_24):
	test	$0x01, %dh
	jnz	LABEL(tail_24)
	test	$0x02, %dh
	jnz	LABEL(tail_25)
	test	$0x04, %dh
	jnz	LABEL(tail_26)
	test	$0x08, %dh
	jnz	LABEL(tail_27)
	test	$0x10, %dh
	jnz	LABEL(tail_28)
	test	$0x20, %dh
	jnz	LABEL(tail_29)
	test	$0x40, %dh
	jnz	LABEL(tail_30)

	.p2align 4
LABEL(tail_31): /* 32 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	24(%rsi), %rdx
	mov	%rdx, 24(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$32, %cl
	sub	$32, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_24): /* 25 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	21(%rsi), %edx
	mov	%edx, 21(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$25, %cl
	sub	$25, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_25): /* 26 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	22(%rsi), %edx
	mov	%edx, 22(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$26, %cl
	sub	$26, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_26): /* 27 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	23(%rsi), %edx
	mov	%edx, 23(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$27, %cl
	sub	$27, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_27): /* 28 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	24(%rsi), %edx
	mov	%edx, 24(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$28, %cl
	sub	$28, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_28): /* 29 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	21(%rsi), %rdx
	mov	%rdx, 21(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$29, %cl
	sub	$29, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_29): /* 30 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	22(%rsi), %rdx
	mov	%rdx, 22(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$30, %cl
	sub	$30, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.p2align 4
LABEL(tail_30): /* 31 bytes */
	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)
	mov	8(%rsi), %rdx
	mov	%rdx, 8(%rdi)
	mov	16(%rsi), %rcx
	mov	%rcx, 16(%rdi)
	mov	23(%rsi), %rdx
	mov	%rdx, 23(%rdi)
#ifdef USE_AS_STRNCPY
	mov	$31, %cl
	sub	$31, %r8
	jnz	LABEL(strncpy_fill_tail)
#endif
	ret

	.pushsection .rodata
	/*
	 * Jump tables.  Every entry is a 32-bit offset of a code label relative
	 * to the start of its own table.  Entry k of tail_table is the stub
	 * that copies k + 1 bytes.
	 */
	.p2align 4
LABEL(tail_table):
	.int	LABEL(tail_0) - LABEL(tail_table) /* 1 byte */
	.int	LABEL(tail_1) - LABEL(tail_table)
	.int	LABEL(tail_2) - LABEL(tail_table)
	.int	LABEL(tail_3) - LABEL(tail_table)
	.int	LABEL(tail_4) - LABEL(tail_table)
	.int	LABEL(tail_5) - LABEL(tail_table)
	.int	LABEL(tail_6) - LABEL(tail_table)
	.int	LABEL(tail_7) - LABEL(tail_table)
	.int	LABEL(tail_8) - LABEL(tail_table)
	.int	LABEL(tail_9) - LABEL(tail_table)
	.int	LABEL(tail_10) - LABEL(tail_table)
	.int	LABEL(tail_11) - LABEL(tail_table)
	.int	LABEL(tail_12) - LABEL(tail_table)
	.int	LABEL(tail_13) - LABEL(tail_table)
	.int	LABEL(tail_14) - LABEL(tail_table)
	.int	LABEL(tail_15) - LABEL(tail_table)
	.int	LABEL(tail_16) - LABEL(tail_table)
	.int	LABEL(tail_17) - LABEL(tail_table)
	.int	LABEL(tail_18) - LABEL(tail_table)
	.int	LABEL(tail_19) - LABEL(tail_table)
	.int	LABEL(tail_20) - LABEL(tail_table)
	.int	LABEL(tail_21) - LABEL(tail_table)
	.int	LABEL(tail_22) - LABEL(tail_table)
	.int	LABEL(tail_23) - LABEL(tail_table)
	.int	LABEL(tail_24) - LABEL(tail_table)
	.int	LABEL(tail_25) - LABEL(tail_table)
	.int	LABEL(tail_26) - LABEL(tail_table)
	.int	LABEL(tail_27) - LABEL(tail_table)
	.int	LABEL(tail_28) - LABEL(tail_table)
	.int	LABEL(tail_29) - LABEL(tail_table)
	.int	LABEL(tail_30) - LABEL(tail_table)
	.int	LABEL(tail_31) - LABEL(tail_table) /* 32 bytes */

	/* Entry k of unaligned_table is the ashr_k copy loop. */
	.p2align 4
LABEL(unaligned_table):
	.int	LABEL(ashr_0) - LABEL(unaligned_table)
	.int	LABEL(ashr_1) - LABEL(unaligned_table)
	.int	LABEL(ashr_2) - LABEL(unaligned_table)
	.int	LABEL(ashr_3) - LABEL(unaligned_table)
	.int	LABEL(ashr_4) - LABEL(unaligned_table)
	.int	LABEL(ashr_5) - LABEL(unaligned_table)
	.int	LABEL(ashr_6) - LABEL(unaligned_table)
	.int	LABEL(ashr_7) - LABEL(unaligned_table)
	.int	LABEL(ashr_8) - LABEL(unaligned_table)
	.int	LABEL(ashr_9) - LABEL(unaligned_table)
	.int	LABEL(ashr_10) - LABEL(unaligned_table)
	.int	LABEL(ashr_11) - LABEL(unaligned_table)
	.int	LABEL(ashr_12) - LABEL(unaligned_table)
	.int	LABEL(ashr_13) - LABEL(unaligned_table)
	.int	LABEL(ashr_14) - LABEL(unaligned_table)
	.int	LABEL(ashr_15) - LABEL(unaligned_table)
	.popsection

#ifdef USE_AS_STRNCPY
	SET_SIZE(strncpy)
#else
	SET_SIZE(strcpy) /* (char *, const char *) */
#endif