/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2008, Intel Corporation
 * All rights reserved.
 */

/*
 * memcpy.s - copies a block of memory between two locations
 *	Implements the memcpy() and memmove() libc primitives.
 */

	.file	"memcpy.s"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "cache.h"
#include "proc64_id.h"

#define	L(s) .memcpy/**/s

/*
 * memcpy algorithm overview:
 *
 * Thresholds used below were determined experimentally.
 *
 * Pseudo code:
 *
 * NOTE: On AMD NO_SSE is always set.  Performance on Opteron did not improve
 * using 16-byte stores.  Setting NO_SSE on AMD should be re-evaluated on
 * future AMD processors.
 *
 * If (size <= 128 bytes) {
 *	do unrolled code (primarily 8-byte loads/stores) regardless of
 *	alignment.
 * } else {
 *	Align destination to 16-byte boundary
 *
 *	if (NO_SSE) {
 *		If (size > half of the largest level cache) {
 *			Use 8-byte non-temporal stores (64 bytes/loop)
 *		} else {
 *			if (size > 4K && size <= half l1 cache size) {
 *				Use rep movsq
 *			} else {
 *				Use 8-byte loads/stores (64 bytes per loop)
 *			}
 *		}
 *	} else {	**USE SSE**
 *		If (size > half of the largest level cache) {
 *			Use 16-byte non-temporal stores (128 bytes per loop)
 *		} else {
 *			If (both source and destination are aligned) {
 *				Use 16-byte aligned loads and stores
 *				(128 bytes/loop)
 *			} else {
 *				Use pairs of xmm registers with SSE2 or SSSE3
 *				instructions to concatenate and shift
 *				appropriately to account for source
 *				unalignment.  This enables 16-byte aligned
 *				loads to be done.
 *			}
 *		}
 *	}
 *
 *	Finish any remaining bytes via unrolled code above.
 * }
 *
 * memmove overview:
 *	memmove is the same as memcpy except for one case, where the copy
 *	needs to be done backwards.  The copy-backwards code is structured
 *	in a similar manner.
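 *
 * Illustrative sketch (added commentary, not part of the original
 * overview) of how the short-copy path picks its unrolled tail.  For a
 * remaining count r <= 0x80, both pointers are advanced past the end of
 * the copy and the L(fwdPxQx) table supplies a 32-bit offset to the
 * label L(PxQy) with r == 8*Q + P; the labels store through negative
 * offsets and fall through toward the final ret:
 *
 *	if (r <= 128) {
 *		dst += r; src += r;		// tail uses negative offsets
 *		off = fwdPxQx[r];		// .int L(PxQy)-L(fwdPxQx)
 *		goto *((char *)fwdPxQx + off);	// Q 8-byte moves + P-byte tail
 *	}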
*/ ENTRY(memmove) cmp %rsi,%rdi # if dst <= src jbe L(CopyForward) # then do copy forward mov %rsi,%r9 # move src to r9 add %rdx,%r9 # add len to get addr of end of src cmp %r9,%rdi # if dst < end of src jb L(CopyBackwards) # then do copy backwards jmp L(CopyForward) ENTRY (memcpy) L(CopyForward): mov %rdx,%r8 mov %rdi,%rcx mov %rsi,%rdx mov %rdi,%rax lea L(fwdPxQx)(%rip),%r11 cmp $0x80,%r8 # 128 jg L(ck_use_sse2) add %r8,%rcx add %r8,%rdx movslq (%r11,%r8,4),%r10 lea (%r10,%r11,1),%r11 jmpq *%r11 .balign 16 L(ShrtAlignNew): lea L(AliPxQx)(%rip),%r11 mov %rcx,%r9 and $0xf,%r9 movslq (%r11,%r9,4),%r10 lea (%r10,%r11,1),%r11 jmpq *%r11 .balign 16 L(fwdPxQx): .int L(P0Q0)-L(fwdPxQx) .int L(P1Q0)-L(fwdPxQx) .int L(P2Q0)-L(fwdPxQx) .int L(P3Q0)-L(fwdPxQx) .int L(P4Q0)-L(fwdPxQx) .int L(P5Q0)-L(fwdPxQx) .int L(P6Q0)-L(fwdPxQx) .int L(P7Q0)-L(fwdPxQx) .int L(P0Q1)-L(fwdPxQx) .int L(P1Q1)-L(fwdPxQx) .int L(P2Q1)-L(fwdPxQx) .int L(P3Q1)-L(fwdPxQx) .int L(P4Q1)-L(fwdPxQx) .int L(P5Q1)-L(fwdPxQx) .int L(P6Q1)-L(fwdPxQx) .int L(P7Q1)-L(fwdPxQx) .int L(P0Q2)-L(fwdPxQx) .int L(P1Q2)-L(fwdPxQx) .int L(P2Q2)-L(fwdPxQx) .int L(P3Q2)-L(fwdPxQx) .int L(P4Q2)-L(fwdPxQx) .int L(P5Q2)-L(fwdPxQx) .int L(P6Q2)-L(fwdPxQx) .int L(P7Q2)-L(fwdPxQx) .int L(P0Q3)-L(fwdPxQx) .int L(P1Q3)-L(fwdPxQx) .int L(P2Q3)-L(fwdPxQx) .int L(P3Q3)-L(fwdPxQx) .int L(P4Q3)-L(fwdPxQx) .int L(P5Q3)-L(fwdPxQx) .int L(P6Q3)-L(fwdPxQx) .int L(P7Q3)-L(fwdPxQx) .int L(P0Q4)-L(fwdPxQx) .int L(P1Q4)-L(fwdPxQx) .int L(P2Q4)-L(fwdPxQx) .int L(P3Q4)-L(fwdPxQx) .int L(P4Q4)-L(fwdPxQx) .int L(P5Q4)-L(fwdPxQx) .int L(P6Q4)-L(fwdPxQx) .int L(P7Q4)-L(fwdPxQx) .int L(P0Q5)-L(fwdPxQx) .int L(P1Q5)-L(fwdPxQx) .int L(P2Q5)-L(fwdPxQx) .int L(P3Q5)-L(fwdPxQx) .int L(P4Q5)-L(fwdPxQx) .int L(P5Q5)-L(fwdPxQx) .int L(P6Q5)-L(fwdPxQx) .int L(P7Q5)-L(fwdPxQx) .int L(P0Q6)-L(fwdPxQx) .int L(P1Q6)-L(fwdPxQx) .int L(P2Q6)-L(fwdPxQx) .int L(P3Q6)-L(fwdPxQx) .int L(P4Q6)-L(fwdPxQx) .int L(P5Q6)-L(fwdPxQx) .int L(P6Q6)-L(fwdPxQx) .int L(P7Q6)-L(fwdPxQx) .int L(P0Q7)-L(fwdPxQx) .int L(P1Q7)-L(fwdPxQx) .int L(P2Q7)-L(fwdPxQx) .int L(P3Q7)-L(fwdPxQx) .int L(P4Q7)-L(fwdPxQx) .int L(P5Q7)-L(fwdPxQx) .int L(P6Q7)-L(fwdPxQx) .int L(P7Q7)-L(fwdPxQx) .int L(P0Q8)-L(fwdPxQx) .int L(P1Q8)-L(fwdPxQx) .int L(P2Q8)-L(fwdPxQx) .int L(P3Q8)-L(fwdPxQx) .int L(P4Q8)-L(fwdPxQx) .int L(P5Q8)-L(fwdPxQx) .int L(P6Q8)-L(fwdPxQx) .int L(P7Q8)-L(fwdPxQx) .int L(P0Q9)-L(fwdPxQx) .int L(P1Q9)-L(fwdPxQx) .int L(P2Q9)-L(fwdPxQx) .int L(P3Q9)-L(fwdPxQx) .int L(P4Q9)-L(fwdPxQx) .int L(P5Q9)-L(fwdPxQx) .int L(P6Q9)-L(fwdPxQx) .int L(P7Q9)-L(fwdPxQx) .int L(P0QA)-L(fwdPxQx) .int L(P1QA)-L(fwdPxQx) .int L(P2QA)-L(fwdPxQx) .int L(P3QA)-L(fwdPxQx) .int L(P4QA)-L(fwdPxQx) .int L(P5QA)-L(fwdPxQx) .int L(P6QA)-L(fwdPxQx) .int L(P7QA)-L(fwdPxQx) .int L(P0QB)-L(fwdPxQx) .int L(P1QB)-L(fwdPxQx) .int L(P2QB)-L(fwdPxQx) .int L(P3QB)-L(fwdPxQx) .int L(P4QB)-L(fwdPxQx) .int L(P5QB)-L(fwdPxQx) .int L(P6QB)-L(fwdPxQx) .int L(P7QB)-L(fwdPxQx) .int L(P0QC)-L(fwdPxQx) .int L(P1QC)-L(fwdPxQx) .int L(P2QC)-L(fwdPxQx) .int L(P3QC)-L(fwdPxQx) .int L(P4QC)-L(fwdPxQx) .int L(P5QC)-L(fwdPxQx) .int L(P6QC)-L(fwdPxQx) .int L(P7QC)-L(fwdPxQx) .int L(P0QD)-L(fwdPxQx) .int L(P1QD)-L(fwdPxQx) .int L(P2QD)-L(fwdPxQx) .int L(P3QD)-L(fwdPxQx) .int L(P4QD)-L(fwdPxQx) .int L(P5QD)-L(fwdPxQx) .int L(P6QD)-L(fwdPxQx) .int L(P7QD)-L(fwdPxQx) .int L(P0QE)-L(fwdPxQx) .int L(P1QE)-L(fwdPxQx) .int L(P2QE)-L(fwdPxQx) .int L(P3QE)-L(fwdPxQx) .int L(P4QE)-L(fwdPxQx) .int L(P5QE)-L(fwdPxQx) .int L(P6QE)-L(fwdPxQx) .int L(P7QE)-L(fwdPxQx) .int 
L(P0QF)-L(fwdPxQx) .int L(P1QF)-L(fwdPxQx) .int L(P2QF)-L(fwdPxQx) .int L(P3QF)-L(fwdPxQx) .int L(P4QF)-L(fwdPxQx) .int L(P5QF)-L(fwdPxQx) .int L(P6QF)-L(fwdPxQx) .int L(P7QF)-L(fwdPxQx) .int L(P0QG)-L(fwdPxQx) # 0x80 .balign 16 L(AliPxQx): .int L(now_qw_aligned)-L(AliPxQx) .int L(A1Q0)-L(AliPxQx) .int L(A2Q0)-L(AliPxQx) .int L(A3Q0)-L(AliPxQx) .int L(A4Q0)-L(AliPxQx) .int L(A5Q0)-L(AliPxQx) .int L(A6Q0)-L(AliPxQx) .int L(A7Q0)-L(AliPxQx) .int L(A0Q1)-L(AliPxQx) .int L(A1Q1)-L(AliPxQx) .int L(A2Q1)-L(AliPxQx) .int L(A3Q1)-L(AliPxQx) .int L(A4Q1)-L(AliPxQx) .int L(A5Q1)-L(AliPxQx) .int L(A6Q1)-L(AliPxQx) .int L(A7Q1)-L(AliPxQx) .balign 16 L(A1Q0): # ; need to move 8+ 7=1+2+4 bytes movzbq (%rdx),%r11 sub $0xf,%r8 mov %r11b,(%rcx) movzwq 0x1(%rdx),%r10 mov %r10w,0x1(%rcx) mov 0x3(%rdx),%r9d mov %r9d,0x3(%rcx) mov 0x7(%rdx),%r11 add $0xf,%rdx mov %r11,0x7(%rcx) add $0xf,%rcx jmp L(now_qw_aligned) .balign 16 L(A2Q0): # ; need to move 8+ 6=2+4 bytes movzwq (%rdx),%r10 sub $0xe,%r8 mov %r10w,(%rcx) mov 0x2(%rdx),%r9d mov %r9d,0x2(%rcx) mov 0x6(%rdx),%r11 add $0xe,%rdx mov %r11,0x6(%rcx) add $0xe,%rcx jmp L(now_qw_aligned) .balign 16 L(A3Q0): # ; need to move 8+ 5=1+4 bytes movzbq (%rdx),%r11 sub $0xd,%r8 mov %r11b,(%rcx) mov 0x1(%rdx),%r9d mov %r9d,0x1(%rcx) mov 0x5(%rdx),%r10 add $0xd,%rdx mov %r10,0x5(%rcx) add $0xd,%rcx jmp L(now_qw_aligned) .balign 16 L(A4Q0): # ; need to move 8+4 bytes mov (%rdx),%r9d sub $0xc,%r8 mov %r9d,(%rcx) mov 0x4(%rdx),%r10 add $0xc,%rdx mov %r10,0x4(%rcx) add $0xc,%rcx jmp L(now_qw_aligned) .balign 16 L(A5Q0): # ; need to move 8+ 3=1+2 bytes movzbq (%rdx),%r11 sub $0xb,%r8 mov %r11b,(%rcx) movzwq 0x1(%rdx),%r10 mov %r10w,0x1(%rcx) mov 0x3(%rdx),%r9 add $0xb,%rdx mov %r9,0x3(%rcx) add $0xb,%rcx jmp L(now_qw_aligned) .balign 16 L(A6Q0): # ; need to move 8+2 bytes movzwq (%rdx),%r10 sub $0xa,%r8 mov %r10w,(%rcx) mov 0x2(%rdx),%r9 add $0xa,%rdx mov %r9,0x2(%rcx) add $0xa,%rcx jmp L(now_qw_aligned) .balign 16 L(A7Q0): # ; need to move 8+1 byte movzbq (%rdx),%r11 sub $0x9,%r8 mov %r11b,(%rcx) mov 0x1(%rdx),%r10 add $0x9,%rdx mov %r10,0x1(%rcx) add $0x9,%rcx jmp L(now_qw_aligned) .balign 16 L(A0Q1): # ; need to move 8 bytes mov (%rdx),%r10 add $0x8,%rdx sub $0x8,%r8 mov %r10,(%rcx) add $0x8,%rcx jmp L(now_qw_aligned) .balign 16 L(A1Q1): # ; need to move 7=1+2+4 bytes movzbq (%rdx),%r11 sub $0x7,%r8 mov %r11b,(%rcx) movzwq 0x1(%rdx),%r10 mov %r10w,0x1(%rcx) mov 0x3(%rdx),%r9d add $0x7,%rdx mov %r9d,0x3(%rcx) add $0x7,%rcx jmp L(now_qw_aligned) .balign 16 L(A2Q1): # ; need to move 6=2+4 bytes movzwq (%rdx),%r10 sub $0x6,%r8 mov %r10w,(%rcx) mov 0x2(%rdx),%r9d add $0x6,%rdx mov %r9d,0x2(%rcx) add $0x6,%rcx jmp L(now_qw_aligned) .balign 16 L(A3Q1): # ; need to move 5=1+4 bytes movzbq (%rdx),%r11 sub $0x5,%r8 mov %r11b,(%rcx) mov 0x1(%rdx),%r9d add $0x5,%rdx mov %r9d,0x1(%rcx) add $0x5,%rcx jmp L(now_qw_aligned) .balign 16 L(A4Q1): # ; need to move 4 bytes mov (%rdx),%r9d sub $0x4,%r8 add $0x4,%rdx mov %r9d,(%rcx) add $0x4,%rcx jmp L(now_qw_aligned) .balign 16 L(A5Q1): # ; need to move 3=1+2 bytes movzbq (%rdx),%r11 sub $0x3,%r8 mov %r11b,(%rcx) movzwq 0x1(%rdx),%r10 add $0x3,%rdx mov %r10w,0x1(%rcx) add $0x3,%rcx jmp L(now_qw_aligned) .balign 16 L(A6Q1): # ; need to move 2 bytes movzwq (%rdx),%r10 sub $0x2,%r8 add $0x2,%rdx mov %r10w,(%rcx) add $0x2,%rcx jmp L(now_qw_aligned) .balign 16 L(A7Q1): # ; need to move 1 byte movzbq (%rdx),%r11 dec %r8 inc %rdx mov %r11b,(%rcx) inc %rcx jmp L(now_qw_aligned) .balign 16 L(P0QG): mov -0x80(%rdx),%r9 mov %r9,-0x80(%rcx) L(P0QF): mov 
-0x78(%rdx),%r10 mov %r10,-0x78(%rcx) L(P0QE): mov -0x70(%rdx),%r9 mov %r9,-0x70(%rcx) L(P0QD): mov -0x68(%rdx),%r10 mov %r10,-0x68(%rcx) L(P0QC): mov -0x60(%rdx),%r9 mov %r9,-0x60(%rcx) L(P0QB): mov -0x58(%rdx),%r10 mov %r10,-0x58(%rcx) L(P0QA): mov -0x50(%rdx),%r9 mov %r9,-0x50(%rcx) L(P0Q9): mov -0x48(%rdx),%r10 mov %r10,-0x48(%rcx) L(P0Q8): mov -0x40(%rdx),%r9 mov %r9,-0x40(%rcx) L(P0Q7): mov -0x38(%rdx),%r10 mov %r10,-0x38(%rcx) L(P0Q6): mov -0x30(%rdx),%r9 mov %r9,-0x30(%rcx) L(P0Q5): mov -0x28(%rdx),%r10 mov %r10,-0x28(%rcx) L(P0Q4): mov -0x20(%rdx),%r9 mov %r9,-0x20(%rcx) L(P0Q3): mov -0x18(%rdx),%r10 mov %r10,-0x18(%rcx) L(P0Q2): mov -0x10(%rdx),%r9 mov %r9,-0x10(%rcx) L(P0Q1): mov -0x8(%rdx),%r10 mov %r10,-0x8(%rcx) L(P0Q0): ret .balign 16 L(P1QF): mov -0x79(%rdx),%r9 mov %r9,-0x79(%rcx) L(P1QE): mov -0x71(%rdx),%r11 mov %r11,-0x71(%rcx) L(P1QD): mov -0x69(%rdx),%r10 mov %r10,-0x69(%rcx) L(P1QC): mov -0x61(%rdx),%r9 mov %r9,-0x61(%rcx) L(P1QB): mov -0x59(%rdx),%r11 mov %r11,-0x59(%rcx) L(P1QA): mov -0x51(%rdx),%r10 mov %r10,-0x51(%rcx) L(P1Q9): mov -0x49(%rdx),%r9 mov %r9,-0x49(%rcx) L(P1Q8): mov -0x41(%rdx),%r11 mov %r11,-0x41(%rcx) L(P1Q7): mov -0x39(%rdx),%r10 mov %r10,-0x39(%rcx) L(P1Q6): mov -0x31(%rdx),%r9 mov %r9,-0x31(%rcx) L(P1Q5): mov -0x29(%rdx),%r11 mov %r11,-0x29(%rcx) L(P1Q4): mov -0x21(%rdx),%r10 mov %r10,-0x21(%rcx) L(P1Q3): mov -0x19(%rdx),%r9 mov %r9,-0x19(%rcx) L(P1Q2): mov -0x11(%rdx),%r11 mov %r11,-0x11(%rcx) L(P1Q1): mov -0x9(%rdx),%r10 mov %r10,-0x9(%rcx) L(P1Q0): movzbq -0x1(%rdx),%r9 mov %r9b,-0x1(%rcx) ret .balign 16 L(P2QF): mov -0x7a(%rdx),%r9 mov %r9,-0x7a(%rcx) L(P2QE): mov -0x72(%rdx),%r11 mov %r11,-0x72(%rcx) L(P2QD): mov -0x6a(%rdx),%r10 mov %r10,-0x6a(%rcx) L(P2QC): mov -0x62(%rdx),%r9 mov %r9,-0x62(%rcx) L(P2QB): mov -0x5a(%rdx),%r11 mov %r11,-0x5a(%rcx) L(P2QA): mov -0x52(%rdx),%r10 mov %r10,-0x52(%rcx) L(P2Q9): mov -0x4a(%rdx),%r9 mov %r9,-0x4a(%rcx) L(P2Q8): mov -0x42(%rdx),%r11 mov %r11,-0x42(%rcx) L(P2Q7): mov -0x3a(%rdx),%r10 mov %r10,-0x3a(%rcx) L(P2Q6): mov -0x32(%rdx),%r9 mov %r9,-0x32(%rcx) L(P2Q5): mov -0x2a(%rdx),%r11 mov %r11,-0x2a(%rcx) L(P2Q4): mov -0x22(%rdx),%r10 mov %r10,-0x22(%rcx) L(P2Q3): mov -0x1a(%rdx),%r9 mov %r9,-0x1a(%rcx) L(P2Q2): mov -0x12(%rdx),%r11 mov %r11,-0x12(%rcx) L(P2Q1): mov -0xa(%rdx),%r10 mov %r10,-0xa(%rcx) L(P2Q0): movzwq -0x2(%rdx),%r9 mov %r9w,-0x2(%rcx) ret .balign 16 L(P3QF): mov -0x7b(%rdx),%r9 mov %r9,-0x7b(%rcx) L(P3QE): mov -0x73(%rdx),%r11 mov %r11,-0x73(%rcx) L(P3QD): mov -0x6b(%rdx),%r10 mov %r10,-0x6b(%rcx) L(P3QC): mov -0x63(%rdx),%r9 mov %r9,-0x63(%rcx) L(P3QB): mov -0x5b(%rdx),%r11 mov %r11,-0x5b(%rcx) L(P3QA): mov -0x53(%rdx),%r10 mov %r10,-0x53(%rcx) L(P3Q9): mov -0x4b(%rdx),%r9 mov %r9,-0x4b(%rcx) L(P3Q8): mov -0x43(%rdx),%r11 mov %r11,-0x43(%rcx) L(P3Q7): mov -0x3b(%rdx),%r10 mov %r10,-0x3b(%rcx) L(P3Q6): mov -0x33(%rdx),%r9 mov %r9,-0x33(%rcx) L(P3Q5): mov -0x2b(%rdx),%r11 mov %r11,-0x2b(%rcx) L(P3Q4): mov -0x23(%rdx),%r10 mov %r10,-0x23(%rcx) L(P3Q3): mov -0x1b(%rdx),%r9 mov %r9,-0x1b(%rcx) L(P3Q2): mov -0x13(%rdx),%r11 mov %r11,-0x13(%rcx) L(P3Q1): mov -0xb(%rdx),%r10 mov %r10,-0xb(%rcx) /* * These trailing loads/stores have to do all their loads 1st, * then do the stores. 
*/ L(P3Q0): movzwq -0x3(%rdx),%r9 movzbq -0x1(%rdx),%r10 mov %r9w,-0x3(%rcx) mov %r10b,-0x1(%rcx) ret .balign 16 L(P4QF): mov -0x7c(%rdx),%r9 mov %r9,-0x7c(%rcx) L(P4QE): mov -0x74(%rdx),%r11 mov %r11,-0x74(%rcx) L(P4QD): mov -0x6c(%rdx),%r10 mov %r10,-0x6c(%rcx) L(P4QC): mov -0x64(%rdx),%r9 mov %r9,-0x64(%rcx) L(P4QB): mov -0x5c(%rdx),%r11 mov %r11,-0x5c(%rcx) L(P4QA): mov -0x54(%rdx),%r10 mov %r10,-0x54(%rcx) L(P4Q9): mov -0x4c(%rdx),%r9 mov %r9,-0x4c(%rcx) L(P4Q8): mov -0x44(%rdx),%r11 mov %r11,-0x44(%rcx) L(P4Q7): mov -0x3c(%rdx),%r10 mov %r10,-0x3c(%rcx) L(P4Q6): mov -0x34(%rdx),%r9 mov %r9,-0x34(%rcx) L(P4Q5): mov -0x2c(%rdx),%r11 mov %r11,-0x2c(%rcx) L(P4Q4): mov -0x24(%rdx),%r10 mov %r10,-0x24(%rcx) L(P4Q3): mov -0x1c(%rdx),%r9 mov %r9,-0x1c(%rcx) L(P4Q2): mov -0x14(%rdx),%r11 mov %r11,-0x14(%rcx) L(P4Q1): mov -0xc(%rdx),%r10 mov %r10,-0xc(%rcx) L(P4Q0): mov -0x4(%rdx),%r9d mov %r9d,-0x4(%rcx) ret .balign 16 L(P5QF): mov -0x7d(%rdx),%r9 mov %r9,-0x7d(%rcx) L(P5QE): mov -0x75(%rdx),%r11 mov %r11,-0x75(%rcx) L(P5QD): mov -0x6d(%rdx),%r10 mov %r10,-0x6d(%rcx) L(P5QC): mov -0x65(%rdx),%r9 mov %r9,-0x65(%rcx) L(P5QB): mov -0x5d(%rdx),%r11 mov %r11,-0x5d(%rcx) L(P5QA): mov -0x55(%rdx),%r10 mov %r10,-0x55(%rcx) L(P5Q9): mov -0x4d(%rdx),%r9 mov %r9,-0x4d(%rcx) L(P5Q8): mov -0x45(%rdx),%r11 mov %r11,-0x45(%rcx) L(P5Q7): mov -0x3d(%rdx),%r10 mov %r10,-0x3d(%rcx) L(P5Q6): mov -0x35(%rdx),%r9 mov %r9,-0x35(%rcx) L(P5Q5): mov -0x2d(%rdx),%r11 mov %r11,-0x2d(%rcx) L(P5Q4): mov -0x25(%rdx),%r10 mov %r10,-0x25(%rcx) L(P5Q3): mov -0x1d(%rdx),%r9 mov %r9,-0x1d(%rcx) L(P5Q2): mov -0x15(%rdx),%r11 mov %r11,-0x15(%rcx) L(P5Q1): mov -0xd(%rdx),%r10 mov %r10,-0xd(%rcx) /* * These trailing loads/stores have to do all their loads 1st, * then do the stores. */ L(P5Q0): mov -0x5(%rdx),%r9d movzbq -0x1(%rdx),%r10 mov %r9d,-0x5(%rcx) mov %r10b,-0x1(%rcx) ret .balign 16 L(P6QF): mov -0x7e(%rdx),%r9 mov %r9,-0x7e(%rcx) L(P6QE): mov -0x76(%rdx),%r11 mov %r11,-0x76(%rcx) L(P6QD): mov -0x6e(%rdx),%r10 mov %r10,-0x6e(%rcx) L(P6QC): mov -0x66(%rdx),%r9 mov %r9,-0x66(%rcx) L(P6QB): mov -0x5e(%rdx),%r11 mov %r11,-0x5e(%rcx) L(P6QA): mov -0x56(%rdx),%r10 mov %r10,-0x56(%rcx) L(P6Q9): mov -0x4e(%rdx),%r9 mov %r9,-0x4e(%rcx) L(P6Q8): mov -0x46(%rdx),%r11 mov %r11,-0x46(%rcx) L(P6Q7): mov -0x3e(%rdx),%r10 mov %r10,-0x3e(%rcx) L(P6Q6): mov -0x36(%rdx),%r9 mov %r9,-0x36(%rcx) L(P6Q5): mov -0x2e(%rdx),%r11 mov %r11,-0x2e(%rcx) L(P6Q4): mov -0x26(%rdx),%r10 mov %r10,-0x26(%rcx) L(P6Q3): mov -0x1e(%rdx),%r9 mov %r9,-0x1e(%rcx) L(P6Q2): mov -0x16(%rdx),%r11 mov %r11,-0x16(%rcx) L(P6Q1): mov -0xe(%rdx),%r10 mov %r10,-0xe(%rcx) /* * These trailing loads/stores have to do all their loads 1st, * then do the stores. 
*/ L(P6Q0): mov -0x6(%rdx),%r9d movzwq -0x2(%rdx),%r10 mov %r9d,-0x6(%rcx) mov %r10w,-0x2(%rcx) ret .balign 16 L(P7QF): mov -0x7f(%rdx),%r9 mov %r9,-0x7f(%rcx) L(P7QE): mov -0x77(%rdx),%r11 mov %r11,-0x77(%rcx) L(P7QD): mov -0x6f(%rdx),%r10 mov %r10,-0x6f(%rcx) L(P7QC): mov -0x67(%rdx),%r9 mov %r9,-0x67(%rcx) L(P7QB): mov -0x5f(%rdx),%r11 mov %r11,-0x5f(%rcx) L(P7QA): mov -0x57(%rdx),%r10 mov %r10,-0x57(%rcx) L(P7Q9): mov -0x4f(%rdx),%r9 mov %r9,-0x4f(%rcx) L(P7Q8): mov -0x47(%rdx),%r11 mov %r11,-0x47(%rcx) L(P7Q7): mov -0x3f(%rdx),%r10 mov %r10,-0x3f(%rcx) L(P7Q6): mov -0x37(%rdx),%r9 mov %r9,-0x37(%rcx) L(P7Q5): mov -0x2f(%rdx),%r11 mov %r11,-0x2f(%rcx) L(P7Q4): mov -0x27(%rdx),%r10 mov %r10,-0x27(%rcx) L(P7Q3): mov -0x1f(%rdx),%r9 mov %r9,-0x1f(%rcx) L(P7Q2): mov -0x17(%rdx),%r11 mov %r11,-0x17(%rcx) L(P7Q1): mov -0xf(%rdx),%r10 mov %r10,-0xf(%rcx) /* * These trailing loads/stores have to do all their loads 1st, * then do the stores. */ L(P7Q0): mov -0x7(%rdx),%r9d movzwq -0x3(%rdx),%r10 movzbq -0x1(%rdx),%r11 mov %r9d,-0x7(%rcx) mov %r10w,-0x3(%rcx) mov %r11b,-0x1(%rcx) ret .balign 16 L(ck_use_sse2): /* * Align dest to 16 byte boundary. */ test $0xf,%rcx jnz L(ShrtAlignNew) L(now_qw_aligned): cmpl $NO_SSE,.memops_method(%rip) je L(Loop8byte_pre) /* * The fall-through path is to do SSE2 16-byte load/stores */ /* * If current move size is larger than half of the highest level cache * size, then do non-temporal moves. */ mov .largest_level_cache_size(%rip),%r9d shr %r9 # take half of it cmp %r9,%r8 jg L(sse2_nt_move) /* * If both the source and dest are aligned, then use the both aligned * logic. Well aligned data should reap the rewards. */ test $0xf,%rdx jz L(pre_both_aligned) lea L(SSE_src)(%rip),%r10 # SSE2 (default) testl $USE_SSSE3,.memops_method(%rip) jz 1f lea L(SSSE3_src)(%rip),%r10 # SSSE3 1: /* * if the src is not 16 byte aligned... 
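 *
 * (Added note on the steps that follow: one unaligned 16-byte chunk is
 * copied to the already 16-byte-aligned destination; the source pointer
 * is advanced 16 bytes and then rounded down to a 16-byte boundary, so
 * it lags the true position by src & 0xf, which L(movdqa_epi) later
 * adds back via %r11; the aligned block at the new source address is
 * pre-loaded into %xmm1; and the code dispatches on src & 0xf through
 * L(SSE_src) or L(SSSE3_src) to a loop that splices pairs of aligned
 * loads back together.)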
*/ mov %rdx,%r11 and $0xf,%r11 movdqu (%rdx),%xmm0 movdqa %xmm0,(%rcx) add $0x10,%rdx sub %r11,%rdx add $0x10,%rcx sub $0x10,%r8 movdqa (%rdx),%xmm1 movslq (%r10,%r11,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(SSSE3_src): .int L(pre_both_aligned)-L(SSSE3_src) .int L(mov3dqa1) -L(SSSE3_src) .int L(mov3dqa2) -L(SSSE3_src) .int L(mov3dqa3) -L(SSSE3_src) .int L(mov3dqa4) -L(SSSE3_src) .int L(mov3dqa5) -L(SSSE3_src) .int L(mov3dqa6) -L(SSSE3_src) .int L(mov3dqa7) -L(SSSE3_src) .int L(movdqa8) -L(SSSE3_src) .int L(mov3dqa9) -L(SSSE3_src) .int L(mov3dqa10)-L(SSSE3_src) .int L(mov3dqa11)-L(SSSE3_src) .int L(mov3dqa12)-L(SSSE3_src) .int L(mov3dqa13)-L(SSSE3_src) .int L(mov3dqa14)-L(SSSE3_src) .int L(mov3dqa15)-L(SSSE3_src) L(SSE_src): .int L(pre_both_aligned)-L(SSE_src) .int L(movdqa1) -L(SSE_src) .int L(movdqa2) -L(SSE_src) .int L(movdqa3) -L(SSE_src) .int L(movdqa4) -L(SSE_src) .int L(movdqa5) -L(SSE_src) .int L(movdqa6) -L(SSE_src) .int L(movdqa7) -L(SSE_src) .int L(movdqa8) -L(SSE_src) .int L(movdqa9) -L(SSE_src) .int L(movdqa10)-L(SSE_src) .int L(movdqa11)-L(SSE_src) .int L(movdqa12)-L(SSE_src) .int L(movdqa13)-L(SSE_src) .int L(movdqa14)-L(SSE_src) .int L(movdqa15)-L(SSE_src) .balign 16 L(movdqa1): movdqa 0x10(%rdx),%xmm3 # load the upper source buffer movdqa 0x20(%rdx),%xmm0 # load the upper source buffer lea 0x20(%rdx),%rdx lea -0x20(%r8),%r8 psrldq $0x1,%xmm1 # shift right prev buffer (saved from last iteration) movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration pslldq $0xf,%xmm3 # shift the current buffer left (shift in zeros) por %xmm1,%xmm3 # OR them together cmp $0x20,%r8 psrldq $0x1,%xmm2 # shift right prev buffer (saved from last iteration) movdqa %xmm0,%xmm1 # store off xmm reg for use next iteration pslldq $0xf,%xmm0 # shift the current buffer left (shift in zeros) por %xmm2,%xmm0 # OR them together movdqa %xmm3,(%rcx) # store it movdqa %xmm0,0x10(%rcx) # store it lea 0x20(%rcx),%rcx jge L(movdqa1) jmp L(movdqa_epi) .balign 16 L(movdqa2): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x2,%xmm1 movdqa %xmm3,%xmm2 pslldq $0xe,%xmm3 por %xmm1,%xmm3 psrldq $0x2,%xmm2 movdqa %xmm0,%xmm1 pslldq $0xe,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa2) jmp L(movdqa_epi) .balign 16 L(movdqa3): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x3,%xmm1 movdqa %xmm3,%xmm2 pslldq $0xd,%xmm3 por %xmm1,%xmm3 psrldq $0x3,%xmm2 movdqa %xmm0,%xmm1 pslldq $0xd,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa3) jmp L(movdqa_epi) .balign 16 L(movdqa4): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x4,%xmm1 movdqa %xmm3,%xmm2 pslldq $0xc,%xmm3 por %xmm1,%xmm3 psrldq $0x4,%xmm2 movdqa %xmm0,%xmm1 pslldq $0xc,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa4) jmp L(movdqa_epi) .balign 16 L(movdqa5): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x5,%xmm1 movdqa %xmm3,%xmm2 pslldq $0xb,%xmm3 por %xmm1,%xmm3 psrldq $0x5,%xmm2 movdqa %xmm0,%xmm1 pslldq $0xb,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa5) jmp L(movdqa_epi) .balign 16 L(movdqa6): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x6,%xmm1 movdqa %xmm3,%xmm2 pslldq $0xa,%xmm3 por %xmm1,%xmm3 psrldq 
$0x6,%xmm2 movdqa %xmm0,%xmm1 pslldq $0xa,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa6) jmp L(movdqa_epi) .balign 16 L(movdqa7): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x7,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x9,%xmm3 por %xmm1,%xmm3 psrldq $0x7,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x9,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa7) jmp L(movdqa_epi) .balign 16 L(movdqa8): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx shufpd $0x1,%xmm3,%xmm1 movdqa %xmm1,(%rcx) cmp $0x30,%r8 shufpd $0x1,%xmm0,%xmm3 movdqa %xmm3,0x10(%rcx) movdqa %xmm5,%xmm1 shufpd $0x1,%xmm5,%xmm0 movdqa %xmm0,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(movdqa8) jmp L(movdqa_epi) .balign 16 L(movdqa9): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0x9,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x7,%xmm3 por %xmm1,%xmm3 psrldq $0x9,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x7,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa9) jmp L(movdqa_epi) .balign 16 L(movdqa10): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xa,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x6,%xmm3 por %xmm1,%xmm3 psrldq $0xa,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x6,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa10) jmp L(movdqa_epi) .balign 16 L(movdqa11): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xb,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x5,%xmm3 por %xmm1,%xmm3 psrldq $0xb,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x5,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa11) jmp L(movdqa_epi) .balign 16 L(movdqa12): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xc,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x4,%xmm3 por %xmm1,%xmm3 psrldq $0xc,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x4,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa12) jmp L(movdqa_epi) .balign 16 L(movdqa13): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xd,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x3,%xmm3 por %xmm1,%xmm3 psrldq $0xd,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x3,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa13) jmp L(movdqa_epi) .balign 16 L(movdqa14): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xe,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x2,%xmm3 por %xmm1,%xmm3 psrldq $0xe,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x2,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa14) jmp L(movdqa_epi) .balign 16 L(movdqa15): sub $0x20,%r8 movdqa 0x10(%rdx),%xmm3 movdqa 0x20(%rdx),%xmm0 add $0x20,%rdx psrldq $0xf,%xmm1 movdqa %xmm3,%xmm2 pslldq $0x1,%xmm3 por %xmm1,%xmm3 psrldq $0xf,%xmm2 movdqa %xmm0,%xmm1 pslldq $0x1,%xmm0 por %xmm2,%xmm0 movdqa %xmm3,(%rcx) movdqa %xmm0,0x10(%rcx) add $0x20,%rcx cmp $0x20,%r8 jge L(movdqa15) #jmp L(movdqa_epi) .balign 16 L(movdqa_epi): lea L(fwdPxQx)(%rip),%r10 add %r11,%rdx # bump rdx to the right addr (it lagged behind in the above loop) add %r8,%rcx add %r8,%rdx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 
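/*
 * Added note: L(mov3dqa1) through L(mov3dqa15) are the SSSE3 variants of
 * the unaligned-source copy.  Each loop iteration loads three aligned
 * 16-byte blocks and uses palignr to splice them with the block carried
 * over in %xmm1 from the previous iteration.  The palignr instructions
 * are hand-encoded as .byte sequences (the intended mnemonic is kept in
 * the neighboring #palignr comments), presumably so the file also
 * assembles with tool chains that predate SSSE3 support.
 */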
L(mov3dqa1): movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 # load the upper source buffer movdqa 0x30(%rdx),%xmm5 # load the upper source buffer lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 # store off xmm reg for use next iteration #palignr $0x1,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x01 movdqa %xmm3,(%rcx) # store it movdqa %xmm0,%xmm4 # store off xmm reg for use next iteration #palignr $0x1,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x01 movdqa %xmm0,0x10(%rcx) # store it movdqa %xmm5,%xmm1 # store off xmm reg for use next iteration #palignr $0x1,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x01 movdqa %xmm5,0x20(%rcx) # store it lea 0x30(%rcx),%rcx jge L(mov3dqa1) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x1,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x01 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x1,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x01 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa2): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x2,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x02 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x2,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x02 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x2,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x02 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa2) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x2,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x02 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x2,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x02 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa3): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x3,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x03 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x3,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x03 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x3,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x03 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa3) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x3,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x03 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x3,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x03 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa4): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa 
%xmm3,%xmm2 #palignr $0x4,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x04 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x4,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x04 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x4,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x04 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa4) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x4,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x04 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x4,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x04 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa5): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x5,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x05 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x5,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x05 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x5,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x05 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa5) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x5,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x05 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x5,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x05 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa6): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x6,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x06 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x6,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x06 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x6,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x06 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa6) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x6,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x06 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x6,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x06 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa7): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x7,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x07 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x7,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x07 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x7,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x07 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa7) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 
0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x7,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x07 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x7,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x07 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa9): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0x9,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x09 movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0x9,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x09 movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0x9,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x09 movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa9) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0x9,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x09 cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0x9,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x09 movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa10): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xa,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0a movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0xa,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0a movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xa,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0a movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa10) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xa,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0a cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xa,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0a movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa11): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xb,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0b movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0xb,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0b movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xb,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0b movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa11) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xb,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0b cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xb,%xmm2,%xmm0 
.byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0b movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa12): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xc,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0c movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0xc,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0c movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xc,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0c movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa12) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xc,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0c cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xc,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0c movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa13): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xd,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0d movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0xd,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0d movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xd,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0d movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa13) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xd,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0d cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xd,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0d movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa14): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xe,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0e movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr $0xe,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0e movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xe,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0e movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa14) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xe,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0e cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xe,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0e movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(mov3dqa15): movdqa 0x10(%rdx),%xmm3 sub $0x30,%r8 movdqa 0x20(%rdx),%xmm0 movdqa 0x30(%rdx),%xmm5 lea 0x30(%rdx),%rdx cmp $0x30,%r8 movdqa %xmm3,%xmm2 #palignr $0xf,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0f movdqa %xmm3,(%rcx) movdqa %xmm0,%xmm4 #palignr 
$0xf,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0f movdqa %xmm0,0x10(%rcx) movdqa %xmm5,%xmm1 #palignr $0xf,%xmm4,%xmm5 .byte 0x66,0x0f,0x3a,0x0f .byte 0xec,0x0f movdqa %xmm5,0x20(%rcx) lea 0x30(%rcx),%rcx jge L(mov3dqa15) cmp $0x10,%r8 jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm3 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx movdqa %xmm3,%xmm2 # save for use next concat #palignr $0xf,%xmm1,%xmm3 .byte 0x66,0x0f,0x3a,0x0f .byte 0xd9,0x0f cmp $0x10,%r8 movdqa %xmm3,(%rcx) # store it lea 0x10(%rcx),%rcx jl L(movdqa_epi) movdqa 0x10(%rdx),%xmm0 # load the upper source buffer sub $0x10,%r8 lea 0x10(%rdx),%rdx #palignr $0xf,%xmm2,%xmm0 .byte 0x66,0x0f,0x3a,0x0f .byte 0xc2,0x0f movdqa %xmm0,(%rcx) # store it lea 0x10(%rcx),%rcx jmp L(movdqa_epi) .balign 16 L(sse2_nt_move): lea 0x40(%rcx),%rcx lea 0x40(%rdx),%rdx lea -0x40(%r8),%r8 /* * doesn't matter if source is aligned for stuff out of cache. * the mis-aligned penalty is masked by the slowness of main memory. */ prefetchnta 0x180(%rdx) movdqu -0x40(%rdx),%xmm0 movdqu -0x30(%rdx),%xmm1 cmp $0x40,%r8 movntdq %xmm0,-0x40(%rcx) movntdq %xmm1,-0x30(%rcx) movdqu -0x20(%rdx),%xmm2 movdqu -0x10(%rdx),%xmm3 movntdq %xmm2,-0x20(%rcx) movntdq %xmm3,-0x10(%rcx) jge L(sse2_nt_move) lea L(Fix16EndTable)(%rip),%r10 mov %r8,%r9 and $0xFFFFFFFFFFFFFFF0,%r9 add %r9,%rcx add %r9,%rdx sub %r9,%r8 shr $0x4,%r9 sfence movslq (%r10,%r9,4),%r11 lea (%r11,%r10,1),%r10 jmpq *%r10 .balign 16 L(Fix16EndTable): .int L(fix16_0)-L(Fix16EndTable) .int L(fix16_1)-L(Fix16EndTable) .int L(fix16_2)-L(Fix16EndTable) .int L(fix16_3)-L(Fix16EndTable) .balign 16 L(fix16_3): movdqu -0x30(%rdx),%xmm1 movdqa %xmm1,-0x30(%rcx) L(fix16_2): movdqu -0x20(%rdx),%xmm2 movdqa %xmm2,-0x20(%rcx) L(fix16_1): movdqu -0x10(%rdx),%xmm3 movdqa %xmm3,-0x10(%rcx) L(fix16_0): lea L(fwdPxQx)(%rip),%r10 add %r8,%rdx add %r8,%rcx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(pre_both_aligned): cmp $0x80,%r8 jl L(fix_16b) .balign 16 L(both_aligned): /* * this 'paired' load/load/store/store seems to do best. 
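 *
 * (Added note: each iteration of L(both_aligned) below moves 0x80 bytes
 * as four load/load/store/store pairs of aligned 16-byte transfers; the
 * cmp that controls the loop sits in the middle of the block rather
 * than at the bottom.)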
*/ movdqa (%rdx),%xmm0 movdqa 0x10(%rdx),%xmm1 movdqa %xmm0,(%rcx) movdqa %xmm1,0x10(%rcx) lea -0x80(%r8),%r8 movdqa 0x20(%rdx),%xmm2 movdqa 0x30(%rdx),%xmm3 movdqa %xmm2,0x20(%rcx) movdqa %xmm3,0x30(%rcx) movdqa 0x40(%rdx),%xmm0 movdqa 0x50(%rdx),%xmm1 cmp $0x80,%r8 movdqa %xmm0,0x40(%rcx) movdqa %xmm1,0x50(%rcx) movdqa 0x60(%rdx),%xmm2 movdqa 0x70(%rdx),%xmm3 lea 0x80(%rdx),%rdx movdqa %xmm2,0x60(%rcx) movdqa %xmm3,0x70(%rcx) lea 0x80(%rcx),%rcx jge L(both_aligned) L(fix_16b): add %r8,%rcx lea L(fwdPxQx)(%rip),%r10 add %r8,%rdx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(Loop8byte_pre): # Use 8-byte moves mov .largest_level_cache_size(%rip),%r9d shr %r9 # take half of it cmp %r9,%r8 jge L(byte8_nt_top) # Find out whether to use rep movsq cmp $4096,%r8 jle L(byte8_top) mov .amd64cache1half(%rip),%r9d # half of l1 cache cmp %r9,%r8 jle L(use_rep) .balign 16 L(byte8_top): mov (%rdx),%r9 mov 0x8(%rdx),%r10 lea -0x40(%r8),%r8 mov %r9,(%rcx) mov %r10,0x8(%rcx) mov 0x10(%rdx),%r11 mov 0x18(%rdx),%r9 mov %r11,0x10(%rcx) mov %r9,0x18(%rcx) cmp $0x40,%r8 mov 0x20(%rdx),%r10 mov 0x28(%rdx),%r11 mov %r10,0x20(%rcx) mov %r11,0x28(%rcx) mov 0x30(%rdx),%r9 mov 0x38(%rdx),%r10 lea 0x40(%rdx),%rdx mov %r9,0x30(%rcx) mov %r10,0x38(%rcx) lea 0x40(%rcx),%rcx jg L(byte8_top) L(byte8_end): lea L(fwdPxQx)(%rip),%r10 lea (%rdx,%r8,1),%rdx lea (%rcx,%r8,1),%rcx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(use_rep): mov %rdx,%rsi # %rsi = source mov %rcx,%rdi # %rdi = destination mov %r8,%rcx # %rcx = count shrq $3,%rcx # 8-byte word count rep movsq mov %rsi,%rdx # source mov %rdi,%rcx # destination andq $7,%r8 # remainder jnz L(byte8_end) ret .balign 16 L(byte8_nt_top): sub $0x40,%r8 prefetchnta 0x180(%rdx) mov (%rdx),%r9 movnti %r9,(%rcx) mov 0x8(%rdx),%r10 movnti %r10,0x8(%rcx) mov 0x10(%rdx),%r11 movnti %r11,0x10(%rcx) mov 0x18(%rdx),%r9 movnti %r9,0x18(%rcx) mov 0x20(%rdx),%r10 movnti %r10,0x20(%rcx) mov 0x28(%rdx),%r11 movnti %r11,0x28(%rcx) mov 0x30(%rdx),%r9 movnti %r9,0x30(%rcx) mov 0x38(%rdx),%r10 movnti %r10,0x38(%rcx) lea 0x40(%rdx),%rdx lea 0x40(%rcx),%rcx cmp $0x40,%r8 jge L(byte8_nt_top) sfence jmp L(byte8_end) SET_SIZE(memcpy) .balign 16 L(CopyBackwards): mov %rdx,%r8 mov %rdi,%rcx mov %rsi,%rdx mov %rdi,%rax # return value # ck alignment of last byte lea (%rcx,%r8,1),%rcx test $0x7,%rcx lea (%rdx,%r8,1),%rdx jne L(bk_align) L(bk_qw_aligned): lea L(bkPxQx)(%rip),%r10 cmp $0x90,%r8 # 144 jg L(bk_ck_sse2_alignment) sub %r8,%rcx sub %r8,%rdx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(bk_align): # only align if len > 8 cmp $8,%r8 jle L(bk_qw_aligned) test $0x1,%rcx je L(bk_tst2) dec %rcx dec %rdx dec %r8 mov (%rdx),%r9b mov %r9b,(%rcx) L(bk_tst2): test $0x2,%rcx je L(bk_tst3) L(bk_got2): sub $0x2,%rcx sub $0x2,%rdx sub $0x2,%r8 movzwq (%rdx),%r9 mov %r9w,(%rcx) L(bk_tst3): test $0x4,%rcx je L(bk_qw_aligned) L(bk_got3): sub $0x4,%rcx sub $0x4,%rdx sub $0x4,%r8 mov (%rdx),%r9d mov %r9d,(%rcx) jmp L(bk_qw_aligned) .balign 16 L(bk_ck_sse2_alignment): cmpl $NO_SSE,.memops_method(%rip) je L(bk_use_rep) # check alignment of last byte test $0xf,%rcx jz L(bk_sse2_cpy) L(bk_sse2_align): # only here if already aligned on at least a qword bndry sub $0x8,%rcx sub $0x8,%rdx sub $0x8,%r8 mov (%rdx),%r9 mov %r9,(%rcx) #jmp L(bk_sse2_cpy) .balign 16 L(bk_sse2_cpy): sub $0x80,%rcx # 128 sub $0x80,%rdx movdqu 0x70(%rdx),%xmm3 movdqu 0x60(%rdx),%xmm2 movdqa %xmm3,0x70(%rcx) movdqa %xmm2,0x60(%rcx) sub $0x80,%r8 movdqu 0x50(%rdx),%xmm1 movdqu 
0x40(%rdx),%xmm0 movdqa %xmm1,0x50(%rcx) movdqa %xmm0,0x40(%rcx) cmp $0x80,%r8 movdqu 0x30(%rdx),%xmm3 movdqu 0x20(%rdx),%xmm2 movdqa %xmm3,0x30(%rcx) movdqa %xmm2,0x20(%rcx) movdqu 0x10(%rdx),%xmm1 movdqu (%rdx),%xmm0 movdqa %xmm1,0x10(%rcx) movdqa %xmm0,(%rcx) jge L(bk_sse2_cpy) L(bk_sse2_cpy_end): lea L(bkPxQx)(%rip),%r10 sub %r8,%rdx sub %r8,%rcx movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 .balign 16 L(bk_use_rep): xchg %rcx,%r9 mov %rdx,%rsi # source mov %r9,%rdi # destination mov %r8,%rcx # count sub $8,%rsi sub $8,%rdi shr $3,%rcx std # reverse direction rep movsq cld # reset direction flag xchg %rcx,%r9 lea L(bkPxQx)(%rip),%r10 sub %r8,%rdx sub %r8,%rcx andq $7,%r8 # remainder jz 2f movslq (%r10,%r8,4),%r9 lea (%r9,%r10,1),%r10 jmpq *%r10 2: ret .balign 16 L(bkP0QI): mov 0x88(%rdx),%r10 mov %r10,0x88(%rcx) L(bkP0QH): mov 0x80(%rdx),%r10 mov %r10,0x80(%rcx) L(bkP0QG): mov 0x78(%rdx),%r9 mov %r9,0x78(%rcx) L(bkP0QF): mov 0x70(%rdx),%r11 mov %r11,0x70(%rcx) L(bkP0QE): mov 0x68(%rdx),%r10 mov %r10,0x68(%rcx) L(bkP0QD): mov 0x60(%rdx),%r9 mov %r9,0x60(%rcx) L(bkP0QC): mov 0x58(%rdx),%r11 mov %r11,0x58(%rcx) L(bkP0QB): mov 0x50(%rdx),%r10 mov %r10,0x50(%rcx) L(bkP0QA): mov 0x48(%rdx),%r9 mov %r9,0x48(%rcx) L(bkP0Q9): mov 0x40(%rdx),%r11 mov %r11,0x40(%rcx) L(bkP0Q8): mov 0x38(%rdx),%r10 mov %r10,0x38(%rcx) L(bkP0Q7): mov 0x30(%rdx),%r9 mov %r9,0x30(%rcx) L(bkP0Q6): mov 0x28(%rdx),%r11 mov %r11,0x28(%rcx) L(bkP0Q5): mov 0x20(%rdx),%r10 mov %r10,0x20(%rcx) L(bkP0Q4): mov 0x18(%rdx),%r9 mov %r9,0x18(%rcx) L(bkP0Q3): mov 0x10(%rdx),%r11 mov %r11,0x10(%rcx) L(bkP0Q2): mov 0x8(%rdx),%r10 mov %r10,0x8(%rcx) L(bkP0Q1): mov (%rdx),%r9 mov %r9,(%rcx) L(bkP0Q0): ret .balign 16 L(bkP1QI): mov 0x89(%rdx),%r10 mov %r10,0x89(%rcx) L(bkP1QH): mov 0x81(%rdx),%r11 mov %r11,0x81(%rcx) L(bkP1QG): mov 0x79(%rdx),%r10 mov %r10,0x79(%rcx) L(bkP1QF): mov 0x71(%rdx),%r9 mov %r9,0x71(%rcx) L(bkP1QE): mov 0x69(%rdx),%r11 mov %r11,0x69(%rcx) L(bkP1QD): mov 0x61(%rdx),%r10 mov %r10,0x61(%rcx) L(bkP1QC): mov 0x59(%rdx),%r9 mov %r9,0x59(%rcx) L(bkP1QB): mov 0x51(%rdx),%r11 mov %r11,0x51(%rcx) L(bkP1QA): mov 0x49(%rdx),%r10 mov %r10,0x49(%rcx) L(bkP1Q9): mov 0x41(%rdx),%r9 mov %r9,0x41(%rcx) L(bkP1Q8): mov 0x39(%rdx),%r11 mov %r11,0x39(%rcx) L(bkP1Q7): mov 0x31(%rdx),%r10 mov %r10,0x31(%rcx) L(bkP1Q6): mov 0x29(%rdx),%r9 mov %r9,0x29(%rcx) L(bkP1Q5): mov 0x21(%rdx),%r11 mov %r11,0x21(%rcx) L(bkP1Q4): mov 0x19(%rdx),%r10 mov %r10,0x19(%rcx) L(bkP1Q3): mov 0x11(%rdx),%r9 mov %r9,0x11(%rcx) L(bkP1Q2): mov 0x9(%rdx),%r11 mov %r11,0x9(%rcx) L(bkP1Q1): mov 0x1(%rdx),%r10 mov %r10,0x1(%rcx) L(bkP1Q0): mov (%rdx),%r9b mov %r9b,(%rcx) ret .balign 16 L(bkP2QI): mov 0x8a(%rdx),%r10 mov %r10,0x8a(%rcx) L(bkP2QH): mov 0x82(%rdx),%r11 mov %r11,0x82(%rcx) L(bkP2QG): mov 0x7a(%rdx),%r10 mov %r10,0x7a(%rcx) L(bkP2QF): mov 0x72(%rdx),%r9 mov %r9,0x72(%rcx) L(bkP2QE): mov 0x6a(%rdx),%r11 mov %r11,0x6a(%rcx) L(bkP2QD): mov 0x62(%rdx),%r10 mov %r10,0x62(%rcx) L(bkP2QC): mov 0x5a(%rdx),%r9 mov %r9,0x5a(%rcx) L(bkP2QB): mov 0x52(%rdx),%r11 mov %r11,0x52(%rcx) L(bkP2QA): mov 0x4a(%rdx),%r10 mov %r10,0x4a(%rcx) L(bkP2Q9): mov 0x42(%rdx),%r9 mov %r9,0x42(%rcx) L(bkP2Q8): mov 0x3a(%rdx),%r11 mov %r11,0x3a(%rcx) L(bkP2Q7): mov 0x32(%rdx),%r10 mov %r10,0x32(%rcx) L(bkP2Q6): mov 0x2a(%rdx),%r9 mov %r9,0x2a(%rcx) L(bkP2Q5): mov 0x22(%rdx),%r11 mov %r11,0x22(%rcx) L(bkP2Q4): mov 0x1a(%rdx),%r10 mov %r10,0x1a(%rcx) L(bkP2Q3): mov 0x12(%rdx),%r9 mov %r9,0x12(%rcx) L(bkP2Q2): mov 0xa(%rdx),%r11 mov %r11,0xa(%rcx) L(bkP2Q1): mov 
0x2(%rdx),%r10 mov %r10,0x2(%rcx) L(bkP2Q0): mov (%rdx),%r9w mov %r9w,(%rcx) ret .balign 16 L(bkP3QI): mov 0x8b(%rdx),%r10 mov %r10,0x8b(%rcx) L(bkP3QH): mov 0x83(%rdx),%r11 mov %r11,0x83(%rcx) L(bkP3QG): mov 0x7b(%rdx),%r10 mov %r10,0x7b(%rcx) L(bkP3QF): mov 0x73(%rdx),%r9 mov %r9,0x73(%rcx) L(bkP3QE): mov 0x6b(%rdx),%r11 mov %r11,0x6b(%rcx) L(bkP3QD): mov 0x63(%rdx),%r10 mov %r10,0x63(%rcx) L(bkP3QC): mov 0x5b(%rdx),%r9 mov %r9,0x5b(%rcx) L(bkP3QB): mov 0x53(%rdx),%r11 mov %r11,0x53(%rcx) L(bkP3QA): mov 0x4b(%rdx),%r10 mov %r10,0x4b(%rcx) L(bkP3Q9): mov 0x43(%rdx),%r9 mov %r9,0x43(%rcx) L(bkP3Q8): mov 0x3b(%rdx),%r11 mov %r11,0x3b(%rcx) L(bkP3Q7): mov 0x33(%rdx),%r10 mov %r10,0x33(%rcx) L(bkP3Q6): mov 0x2b(%rdx),%r9 mov %r9,0x2b(%rcx) L(bkP3Q5): mov 0x23(%rdx),%r11 mov %r11,0x23(%rcx) L(bkP3Q4): mov 0x1b(%rdx),%r10 mov %r10,0x1b(%rcx) L(bkP3Q3): mov 0x13(%rdx),%r9 mov %r9,0x13(%rcx) L(bkP3Q2): mov 0xb(%rdx),%r11 mov %r11,0xb(%rcx) L(bkP3Q1): mov 0x3(%rdx),%r10 mov %r10,0x3(%rcx) L(bkP3Q0): # trailing loads/stores do all their loads 1st, then do the stores mov 0x1(%rdx),%r9w mov %r9w,0x1(%rcx) mov (%rdx),%r10b mov %r10b,(%rcx) ret .balign 16 L(bkP4QI): mov 0x8c(%rdx),%r10 mov %r10,0x8c(%rcx) L(bkP4QH): mov 0x84(%rdx),%r11 mov %r11,0x84(%rcx) L(bkP4QG): mov 0x7c(%rdx),%r10 mov %r10,0x7c(%rcx) L(bkP4QF): mov 0x74(%rdx),%r9 mov %r9,0x74(%rcx) L(bkP4QE): mov 0x6c(%rdx),%r11 mov %r11,0x6c(%rcx) L(bkP4QD): mov 0x64(%rdx),%r10 mov %r10,0x64(%rcx) L(bkP4QC): mov 0x5c(%rdx),%r9 mov %r9,0x5c(%rcx) L(bkP4QB): mov 0x54(%rdx),%r11 mov %r11,0x54(%rcx) L(bkP4QA): mov 0x4c(%rdx),%r10 mov %r10,0x4c(%rcx) L(bkP4Q9): mov 0x44(%rdx),%r9 mov %r9,0x44(%rcx) L(bkP4Q8): mov 0x3c(%rdx),%r11 mov %r11,0x3c(%rcx) L(bkP4Q7): mov 0x34(%rdx),%r10 mov %r10,0x34(%rcx) L(bkP4Q6): mov 0x2c(%rdx),%r9 mov %r9,0x2c(%rcx) L(bkP4Q5): mov 0x24(%rdx),%r11 mov %r11,0x24(%rcx) L(bkP4Q4): mov 0x1c(%rdx),%r10 mov %r10,0x1c(%rcx) L(bkP4Q3): mov 0x14(%rdx),%r9 mov %r9,0x14(%rcx) L(bkP4Q2): mov 0xc(%rdx),%r11 mov %r11,0xc(%rcx) L(bkP4Q1): mov 0x4(%rdx),%r10 mov %r10,0x4(%rcx) L(bkP4Q0): mov (%rdx),%r9d mov %r9d,(%rcx) ret .balign 16 L(bkP5QI): mov 0x8d(%rdx),%r10 mov %r10,0x8d(%rcx) L(bkP5QH): mov 0x85(%rdx),%r9 mov %r9,0x85(%rcx) L(bkP5QG): mov 0x7d(%rdx),%r11 mov %r11,0x7d(%rcx) L(bkP5QF): mov 0x75(%rdx),%r10 mov %r10,0x75(%rcx) L(bkP5QE): mov 0x6d(%rdx),%r9 mov %r9,0x6d(%rcx) L(bkP5QD): mov 0x65(%rdx),%r11 mov %r11,0x65(%rcx) L(bkP5QC): mov 0x5d(%rdx),%r10 mov %r10,0x5d(%rcx) L(bkP5QB): mov 0x55(%rdx),%r9 mov %r9,0x55(%rcx) L(bkP5QA): mov 0x4d(%rdx),%r11 mov %r11,0x4d(%rcx) L(bkP5Q9): mov 0x45(%rdx),%r10 mov %r10,0x45(%rcx) L(bkP5Q8): mov 0x3d(%rdx),%r9 mov %r9,0x3d(%rcx) L(bkP5Q7): mov 0x35(%rdx),%r11 mov %r11,0x35(%rcx) L(bkP5Q6): mov 0x2d(%rdx),%r10 mov %r10,0x2d(%rcx) L(bkP5Q5): mov 0x25(%rdx),%r9 mov %r9,0x25(%rcx) L(bkP5Q4): mov 0x1d(%rdx),%r11 mov %r11,0x1d(%rcx) L(bkP5Q3): mov 0x15(%rdx),%r10 mov %r10,0x15(%rcx) L(bkP5Q2): mov 0xd(%rdx),%r9 mov %r9,0xd(%rcx) L(bkP5Q1): mov 0x5(%rdx),%r11 mov %r11,0x5(%rcx) L(bkP5Q0): # trailing loads/stores do all their loads 1st, then do the stores mov 0x1(%rdx),%r9d mov %r9d,0x1(%rcx) mov (%rdx),%r10b mov %r10b,(%rcx) ret .balign 16 L(bkP6QI): mov 0x8e(%rdx),%r10 mov %r10,0x8e(%rcx) L(bkP6QH): mov 0x86(%rdx),%r11 mov %r11,0x86(%rcx) L(bkP6QG): mov 0x7e(%rdx),%r10 mov %r10,0x7e(%rcx) L(bkP6QF): mov 0x76(%rdx),%r9 mov %r9,0x76(%rcx) L(bkP6QE): mov 0x6e(%rdx),%r11 mov %r11,0x6e(%rcx) L(bkP6QD): mov 0x66(%rdx),%r10 mov %r10,0x66(%rcx) L(bkP6QC): mov 0x5e(%rdx),%r9 mov %r9,0x5e(%rcx) L(bkP6QB): 
mov 0x56(%rdx),%r11 mov %r11,0x56(%rcx) L(bkP6QA): mov 0x4e(%rdx),%r10 mov %r10,0x4e(%rcx) L(bkP6Q9): mov 0x46(%rdx),%r9 mov %r9,0x46(%rcx) L(bkP6Q8): mov 0x3e(%rdx),%r11 mov %r11,0x3e(%rcx) L(bkP6Q7): mov 0x36(%rdx),%r10 mov %r10,0x36(%rcx) L(bkP6Q6): mov 0x2e(%rdx),%r9 mov %r9,0x2e(%rcx) L(bkP6Q5): mov 0x26(%rdx),%r11 mov %r11,0x26(%rcx) L(bkP6Q4): mov 0x1e(%rdx),%r10 mov %r10,0x1e(%rcx) L(bkP6Q3): mov 0x16(%rdx),%r9 mov %r9,0x16(%rcx) L(bkP6Q2): mov 0xe(%rdx),%r11 mov %r11,0xe(%rcx) L(bkP6Q1): mov 0x6(%rdx),%r10 mov %r10,0x6(%rcx) L(bkP6Q0): # trailing loads/stores do all their loads 1st, then do the stores mov 0x2(%rdx),%r9d mov %r9d,0x2(%rcx) mov (%rdx),%r10w mov %r10w,(%rcx) ret .balign 16 L(bkP7QI): mov 0x8f(%rdx),%r10 mov %r10,0x8f(%rcx) L(bkP7QH): mov 0x87(%rdx),%r11 mov %r11,0x87(%rcx) L(bkP7QG): mov 0x7f(%rdx),%r10 mov %r10,0x7f(%rcx) L(bkP7QF): mov 0x77(%rdx),%r9 mov %r9,0x77(%rcx) L(bkP7QE): mov 0x6f(%rdx),%r11 mov %r11,0x6f(%rcx) L(bkP7QD): mov 0x67(%rdx),%r10 mov %r10,0x67(%rcx) L(bkP7QC): mov 0x5f(%rdx),%r9 mov %r9,0x5f(%rcx) L(bkP7QB): mov 0x57(%rdx),%r11 mov %r11,0x57(%rcx) L(bkP7QA): mov 0x4f(%rdx),%r10 mov %r10,0x4f(%rcx) L(bkP7Q9): mov 0x47(%rdx),%r9 mov %r9,0x47(%rcx) L(bkP7Q8): mov 0x3f(%rdx),%r11 mov %r11,0x3f(%rcx) L(bkP7Q7): mov 0x37(%rdx),%r10 mov %r10,0x37(%rcx) L(bkP7Q6): mov 0x2f(%rdx),%r9 mov %r9,0x2f(%rcx) L(bkP7Q5): mov 0x27(%rdx),%r11 mov %r11,0x27(%rcx) L(bkP7Q4): mov 0x1f(%rdx),%r10 mov %r10,0x1f(%rcx) L(bkP7Q3): mov 0x17(%rdx),%r9 mov %r9,0x17(%rcx) L(bkP7Q2): mov 0xf(%rdx),%r11 mov %r11,0xf(%rcx) L(bkP7Q1): mov 0x7(%rdx),%r10 mov %r10,0x7(%rcx) L(bkP7Q0): # trailing loads/stores do all their loads 1st, then do the stores mov 0x3(%rdx),%r9d mov %r9d,0x3(%rcx) mov 0x1(%rdx),%r10w mov %r10w,0x1(%rcx) mov (%rdx),%r11b mov %r11b,(%rcx) ret .balign 16 L(bkPxQx): .int L(bkP0Q0)-L(bkPxQx) .int L(bkP1Q0)-L(bkPxQx) .int L(bkP2Q0)-L(bkPxQx) .int L(bkP3Q0)-L(bkPxQx) .int L(bkP4Q0)-L(bkPxQx) .int L(bkP5Q0)-L(bkPxQx) .int L(bkP6Q0)-L(bkPxQx) .int L(bkP7Q0)-L(bkPxQx) .int L(bkP0Q1)-L(bkPxQx) .int L(bkP1Q1)-L(bkPxQx) .int L(bkP2Q1)-L(bkPxQx) .int L(bkP3Q1)-L(bkPxQx) .int L(bkP4Q1)-L(bkPxQx) .int L(bkP5Q1)-L(bkPxQx) .int L(bkP6Q1)-L(bkPxQx) .int L(bkP7Q1)-L(bkPxQx) .int L(bkP0Q2)-L(bkPxQx) .int L(bkP1Q2)-L(bkPxQx) .int L(bkP2Q2)-L(bkPxQx) .int L(bkP3Q2)-L(bkPxQx) .int L(bkP4Q2)-L(bkPxQx) .int L(bkP5Q2)-L(bkPxQx) .int L(bkP6Q2)-L(bkPxQx) .int L(bkP7Q2)-L(bkPxQx) .int L(bkP0Q3)-L(bkPxQx) .int L(bkP1Q3)-L(bkPxQx) .int L(bkP2Q3)-L(bkPxQx) .int L(bkP3Q3)-L(bkPxQx) .int L(bkP4Q3)-L(bkPxQx) .int L(bkP5Q3)-L(bkPxQx) .int L(bkP6Q3)-L(bkPxQx) .int L(bkP7Q3)-L(bkPxQx) .int L(bkP0Q4)-L(bkPxQx) .int L(bkP1Q4)-L(bkPxQx) .int L(bkP2Q4)-L(bkPxQx) .int L(bkP3Q4)-L(bkPxQx) .int L(bkP4Q4)-L(bkPxQx) .int L(bkP5Q4)-L(bkPxQx) .int L(bkP6Q4)-L(bkPxQx) .int L(bkP7Q4)-L(bkPxQx) .int L(bkP0Q5)-L(bkPxQx) .int L(bkP1Q5)-L(bkPxQx) .int L(bkP2Q5)-L(bkPxQx) .int L(bkP3Q5)-L(bkPxQx) .int L(bkP4Q5)-L(bkPxQx) .int L(bkP5Q5)-L(bkPxQx) .int L(bkP6Q5)-L(bkPxQx) .int L(bkP7Q5)-L(bkPxQx) .int L(bkP0Q6)-L(bkPxQx) .int L(bkP1Q6)-L(bkPxQx) .int L(bkP2Q6)-L(bkPxQx) .int L(bkP3Q6)-L(bkPxQx) .int L(bkP4Q6)-L(bkPxQx) .int L(bkP5Q6)-L(bkPxQx) .int L(bkP6Q6)-L(bkPxQx) .int L(bkP7Q6)-L(bkPxQx) .int L(bkP0Q7)-L(bkPxQx) .int L(bkP1Q7)-L(bkPxQx) .int L(bkP2Q7)-L(bkPxQx) .int L(bkP3Q7)-L(bkPxQx) .int L(bkP4Q7)-L(bkPxQx) .int L(bkP5Q7)-L(bkPxQx) .int L(bkP6Q7)-L(bkPxQx) .int L(bkP7Q7)-L(bkPxQx) .int L(bkP0Q8)-L(bkPxQx) .int L(bkP1Q8)-L(bkPxQx) .int L(bkP2Q8)-L(bkPxQx) .int L(bkP3Q8)-L(bkPxQx) .int L(bkP4Q8)-L(bkPxQx) .int 
L(bkP5Q8)-L(bkPxQx) .int L(bkP6Q8)-L(bkPxQx) .int L(bkP7Q8)-L(bkPxQx) .int L(bkP0Q9)-L(bkPxQx) .int L(bkP1Q9)-L(bkPxQx) .int L(bkP2Q9)-L(bkPxQx) .int L(bkP3Q9)-L(bkPxQx) .int L(bkP4Q9)-L(bkPxQx) .int L(bkP5Q9)-L(bkPxQx) .int L(bkP6Q9)-L(bkPxQx) .int L(bkP7Q9)-L(bkPxQx) .int L(bkP0QA)-L(bkPxQx) .int L(bkP1QA)-L(bkPxQx) .int L(bkP2QA)-L(bkPxQx) .int L(bkP3QA)-L(bkPxQx) .int L(bkP4QA)-L(bkPxQx) .int L(bkP5QA)-L(bkPxQx) .int L(bkP6QA)-L(bkPxQx) .int L(bkP7QA)-L(bkPxQx) .int L(bkP0QB)-L(bkPxQx) .int L(bkP1QB)-L(bkPxQx) .int L(bkP2QB)-L(bkPxQx) .int L(bkP3QB)-L(bkPxQx) .int L(bkP4QB)-L(bkPxQx) .int L(bkP5QB)-L(bkPxQx) .int L(bkP6QB)-L(bkPxQx) .int L(bkP7QB)-L(bkPxQx) .int L(bkP0QC)-L(bkPxQx) .int L(bkP1QC)-L(bkPxQx) .int L(bkP2QC)-L(bkPxQx) .int L(bkP3QC)-L(bkPxQx) .int L(bkP4QC)-L(bkPxQx) .int L(bkP5QC)-L(bkPxQx) .int L(bkP6QC)-L(bkPxQx) .int L(bkP7QC)-L(bkPxQx) .int L(bkP0QD)-L(bkPxQx) .int L(bkP1QD)-L(bkPxQx) .int L(bkP2QD)-L(bkPxQx) .int L(bkP3QD)-L(bkPxQx) .int L(bkP4QD)-L(bkPxQx) .int L(bkP5QD)-L(bkPxQx) .int L(bkP6QD)-L(bkPxQx) .int L(bkP7QD)-L(bkPxQx) .int L(bkP0QE)-L(bkPxQx) .int L(bkP1QE)-L(bkPxQx) .int L(bkP2QE)-L(bkPxQx) .int L(bkP3QE)-L(bkPxQx) .int L(bkP4QE)-L(bkPxQx) .int L(bkP5QE)-L(bkPxQx) .int L(bkP6QE)-L(bkPxQx) .int L(bkP7QE)-L(bkPxQx) .int L(bkP0QF)-L(bkPxQx) .int L(bkP1QF)-L(bkPxQx) .int L(bkP2QF)-L(bkPxQx) .int L(bkP3QF)-L(bkPxQx) .int L(bkP4QF)-L(bkPxQx) .int L(bkP5QF)-L(bkPxQx) .int L(bkP6QF)-L(bkPxQx) .int L(bkP7QF)-L(bkPxQx) .int L(bkP0QG)-L(bkPxQx) .int L(bkP1QG)-L(bkPxQx) .int L(bkP2QG)-L(bkPxQx) .int L(bkP3QG)-L(bkPxQx) .int L(bkP4QG)-L(bkPxQx) .int L(bkP5QG)-L(bkPxQx) .int L(bkP6QG)-L(bkPxQx) .int L(bkP7QG)-L(bkPxQx) .int L(bkP0QH)-L(bkPxQx) .int L(bkP1QH)-L(bkPxQx) .int L(bkP2QH)-L(bkPxQx) .int L(bkP3QH)-L(bkPxQx) .int L(bkP4QH)-L(bkPxQx) .int L(bkP5QH)-L(bkPxQx) .int L(bkP6QH)-L(bkPxQx) .int L(bkP7QH)-L(bkPxQx) .int L(bkP0QI)-L(bkPxQx) .int L(bkP1QI)-L(bkPxQx) .int L(bkP2QI)-L(bkPxQx) .int L(bkP3QI)-L(bkPxQx) .int L(bkP4QI)-L(bkPxQx) .int L(bkP5QI)-L(bkPxQx) .int L(bkP6QI)-L(bkPxQx) .int L(bkP7QI)-L(bkPxQx) SET_SIZE(memmove)