| /* strcpy with SSSE3 |
| Copyright (C) 2011-2014 Free Software Foundation, Inc. |
| Contributed by Intel Corporation. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #ifndef NOT_IN_libc |
| |
| # ifndef USE_AS_STRCAT |
| # include <sysdep.h> |
| |
| # ifndef STRCPY |
| # define STRCPY __strcpy_ssse3 |
| # endif |
| |
| .section .text.ssse3,"ax",@progbits |
| ENTRY (STRCPY) |
| |
| mov %rsi, %rcx |
| # ifdef USE_AS_STRNCPY |
| mov %rdx, %r8 |
| # endif |
| mov %rdi, %rdx |
| # ifdef USE_AS_STRNCPY |
| test %r8, %r8 |
| jz L(Exit0) |
| cmp $8, %r8 |
| jbe L(StrncpyExit8Bytes) |
| # endif |
| cmpb $0, (%rcx) |
| jz L(Exit1) |
| cmpb $0, 1(%rcx) |
| jz L(Exit2) |
| cmpb $0, 2(%rcx) |
| jz L(Exit3) |
| cmpb $0, 3(%rcx) |
| jz L(Exit4) |
| cmpb $0, 4(%rcx) |
| jz L(Exit5) |
| cmpb $0, 5(%rcx) |
| jz L(Exit6) |
| cmpb $0, 6(%rcx) |
| jz L(Exit7) |
| cmpb $0, 7(%rcx) |
| jz L(Exit8) |
| # ifdef USE_AS_STRNCPY |
| cmp $16, %r8 |
| jb L(StrncpyExit15Bytes) |
| # endif |
| cmpb $0, 8(%rcx) |
| jz L(Exit9) |
| cmpb $0, 9(%rcx) |
| jz L(Exit10) |
| cmpb $0, 10(%rcx) |
| jz L(Exit11) |
| cmpb $0, 11(%rcx) |
| jz L(Exit12) |
| cmpb $0, 12(%rcx) |
| jz L(Exit13) |
| cmpb $0, 13(%rcx) |
| jz L(Exit14) |
| cmpb $0, 14(%rcx) |
| jz L(Exit15) |
| # ifdef USE_AS_STRNCPY |
| cmp $16, %r8 |
| je L(Exit16) |
| # endif |
| cmpb $0, 15(%rcx) |
| jz L(Exit16) |
| # endif |
| |
| # ifdef USE_AS_STRNCPY |
| mov %rcx, %rsi |
| sub $16, %r8 |
| and $0xf, %rsi |
| |
| /* add 16 bytes rcx_offset to r8 */ |
| |
| add %rsi, %r8 |
| # endif |
| lea 16(%rcx), %rsi |
| and $-16, %rsi |
| pxor %xmm0, %xmm0 |
| mov (%rcx), %r9 |
| mov %r9, (%rdx) |
| pcmpeqb (%rsi), %xmm0 |
| mov 8(%rcx), %r9 |
| mov %r9, 8(%rdx) |
| |
| /* convert byte mask in xmm0 to bit mask */ |
| |
| pmovmskb %xmm0, %rax |
| sub %rcx, %rsi |
| |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| mov %rdx, %rax |
| lea 16(%rdx), %rdx |
| and $-16, %rdx |
| sub %rdx, %rax |
| |
| # ifdef USE_AS_STRNCPY |
| add %rax, %rsi |
| lea -1(%rsi), %rsi |
| and $1<<31, %esi |
| test %rsi, %rsi |
| jnz L(ContinueCopy) |
| lea 16(%r8), %r8 |
| |
| L(ContinueCopy): |
| # endif |
| sub %rax, %rcx |
| mov %rcx, %rax |
| and $0xf, %rax |
| mov $0, %rsi |
| |
| /* case: rcx_offset == rdx_offset */ |
| |
| jz L(Align16Both) |
| |
| cmp $8, %rax |
| jae L(ShlHigh8) |
| cmp $1, %rax |
| je L(Shl1) |
| cmp $2, %rax |
| je L(Shl2) |
| cmp $3, %rax |
| je L(Shl3) |
| cmp $4, %rax |
| je L(Shl4) |
| cmp $5, %rax |
| je L(Shl5) |
| cmp $6, %rax |
| je L(Shl6) |
| jmp L(Shl7) |
| |
| L(ShlHigh8): |
| je L(Shl8) |
| cmp $9, %rax |
| je L(Shl9) |
| cmp $10, %rax |
| je L(Shl10) |
| cmp $11, %rax |
| je L(Shl11) |
| cmp $12, %rax |
| je L(Shl12) |
| cmp $13, %rax |
| je L(Shl13) |
| cmp $14, %rax |
| je L(Shl14) |
| jmp L(Shl15) |
| |
| L(Align16Both): |
| movaps (%rcx), %xmm1 |
| movaps 16(%rcx), %xmm2 |
| movaps %xmm1, (%rdx) |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps 16(%rcx, %rsi), %xmm3 |
| movaps %xmm2, (%rdx, %rsi) |
| pcmpeqb %xmm3, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps 16(%rcx, %rsi), %xmm4 |
| movaps %xmm3, (%rdx, %rsi) |
| pcmpeqb %xmm4, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps 16(%rcx, %rsi), %xmm1 |
| movaps %xmm4, (%rdx, %rsi) |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps 16(%rcx, %rsi), %xmm2 |
| movaps %xmm1, (%rdx, %rsi) |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps 16(%rcx, %rsi), %xmm3 |
| movaps %xmm2, (%rdx, %rsi) |
| pcmpeqb %xmm3, %xmm0 |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(CopyFrom1To16BytesCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps %xmm3, (%rdx, %rsi) |
| mov %rcx, %rax |
| lea 16(%rcx, %rsi), %rcx |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| lea 112(%r8, %rax), %r8 |
| # endif |
| mov $-0x40, %rsi |
| |
| .p2align 4 |
| L(Aligned64Loop): |
| movaps (%rcx), %xmm2 |
| movaps %xmm2, %xmm4 |
| movaps 16(%rcx), %xmm5 |
| movaps 32(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 48(%rcx), %xmm7 |
| pminub %xmm5, %xmm2 |
| pminub %xmm7, %xmm3 |
| pminub %xmm2, %xmm3 |
| pcmpeqb %xmm0, %xmm3 |
| pmovmskb %xmm3, %rax |
| lea 64(%rdx), %rdx |
| lea 64(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeaveCase2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Aligned64Leave) |
| movaps %xmm4, -64(%rdx) |
| movaps %xmm5, -48(%rdx) |
| movaps %xmm6, -32(%rdx) |
| movaps %xmm7, -16(%rdx) |
| jmp L(Aligned64Loop) |
| |
| L(Aligned64Leave): |
| # ifdef USE_AS_STRNCPY |
| lea 48(%r8), %r8 |
| # endif |
| pcmpeqb %xmm4, %xmm0 |
| pmovmskb %xmm0, %rax |
| test %rax, %rax |
| jnz L(CopyFrom1To16Bytes) |
| |
| pcmpeqb %xmm5, %xmm0 |
| # ifdef USE_AS_STRNCPY |
| lea -16(%r8), %r8 |
| # endif |
| pmovmskb %xmm0, %rax |
| movaps %xmm4, -64(%rdx) |
| test %rax, %rax |
| lea 16(%rsi), %rsi |
| jnz L(CopyFrom1To16Bytes) |
| |
| pcmpeqb %xmm6, %xmm0 |
| # ifdef USE_AS_STRNCPY |
| lea -16(%r8), %r8 |
| # endif |
| pmovmskb %xmm0, %rax |
| movaps %xmm5, -48(%rdx) |
| test %rax, %rax |
| lea 16(%rsi), %rsi |
| jnz L(CopyFrom1To16Bytes) |
| |
| movaps %xmm6, -32(%rdx) |
| pcmpeqb %xmm7, %xmm0 |
| # ifdef USE_AS_STRNCPY |
| lea -16(%r8), %r8 |
| # endif |
| pmovmskb %xmm0, %rax |
| lea 16(%rsi), %rsi |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl1): |
| movaps -1(%rcx), %xmm1 |
| movaps 15(%rcx), %xmm2 |
| L(Shl1Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit1Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl1LoopExit) |
| |
| palignr $1, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 31(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit1Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl1LoopExit) |
| |
| palignr $1, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 31(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit1Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl1LoopExit) |
| |
| palignr $1, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 31(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit1Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl1LoopExit) |
| |
| palignr $1, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 31(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -15(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -1(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl1LoopStart): |
| movaps 15(%rcx), %xmm2 |
| movaps 31(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 47(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 63(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $1, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $1, %xmm3, %xmm4 |
| jnz L(Shl1Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave1) |
| # endif |
| palignr $1, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $1, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl1LoopStart) |
| |
| L(Shl1LoopExit): |
| movdqu -1(%rcx), %xmm1 |
| mov $15, %rsi |
| movdqu %xmm1, -1(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl2): |
| movaps -2(%rcx), %xmm1 |
| movaps 14(%rcx), %xmm2 |
| L(Shl2Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit2Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl2LoopExit) |
| |
| palignr $2, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 30(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit2Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl2LoopExit) |
| |
| palignr $2, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 30(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit2Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl2LoopExit) |
| |
| palignr $2, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 30(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit2Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl2LoopExit) |
| |
| palignr $2, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 30(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -14(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -2(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl2LoopStart): |
| movaps 14(%rcx), %xmm2 |
| movaps 30(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 46(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 62(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $2, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $2, %xmm3, %xmm4 |
| jnz L(Shl2Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave2) |
| # endif |
| palignr $2, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $2, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl2LoopStart) |
| |
| L(Shl2LoopExit): |
| movdqu -2(%rcx), %xmm1 |
| mov $14, %rsi |
| movdqu %xmm1, -2(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl3): |
| movaps -3(%rcx), %xmm1 |
| movaps 13(%rcx), %xmm2 |
| L(Shl3Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit3Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl3LoopExit) |
| |
| palignr $3, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 29(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit3Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl3LoopExit) |
| |
| palignr $3, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 29(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit3Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl3LoopExit) |
| |
| palignr $3, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 29(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit3Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl3LoopExit) |
| |
| palignr $3, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 29(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -13(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -3(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl3LoopStart): |
| movaps 13(%rcx), %xmm2 |
| movaps 29(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 45(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 61(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $3, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $3, %xmm3, %xmm4 |
| jnz L(Shl3Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave3) |
| # endif |
| palignr $3, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $3, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl3LoopStart) |
| |
| L(Shl3LoopExit): |
| movdqu -3(%rcx), %xmm1 |
| mov $13, %rsi |
| movdqu %xmm1, -3(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl4): |
| movaps -4(%rcx), %xmm1 |
| movaps 12(%rcx), %xmm2 |
| L(Shl4Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit4Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl4LoopExit) |
| |
| palignr $4, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 28(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit4Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl4LoopExit) |
| |
| palignr $4, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 28(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit4Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl4LoopExit) |
| |
| palignr $4, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 28(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit4Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl4LoopExit) |
| |
| palignr $4, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 28(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -12(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -4(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl4LoopStart): |
| movaps 12(%rcx), %xmm2 |
| movaps 28(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 44(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 60(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $4, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $4, %xmm3, %xmm4 |
| jnz L(Shl4Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave4) |
| # endif |
| palignr $4, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $4, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl4LoopStart) |
| |
| L(Shl4LoopExit): |
| movdqu -4(%rcx), %xmm1 |
| mov $12, %rsi |
| movdqu %xmm1, -4(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl5): |
| movaps -5(%rcx), %xmm1 |
| movaps 11(%rcx), %xmm2 |
| L(Shl5Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit5Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl5LoopExit) |
| |
| palignr $5, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 27(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit5Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl5LoopExit) |
| |
| palignr $5, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 27(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit5Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl5LoopExit) |
| |
| palignr $5, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 27(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit5Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl5LoopExit) |
| |
| palignr $5, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 27(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -11(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -5(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl5LoopStart): |
| movaps 11(%rcx), %xmm2 |
| movaps 27(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 43(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 59(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $5, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $5, %xmm3, %xmm4 |
| jnz L(Shl5Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave5) |
| # endif |
| palignr $5, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $5, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl5LoopStart) |
| |
| L(Shl5LoopExit): |
| movdqu -5(%rcx), %xmm1 |
| mov $11, %rsi |
| movdqu %xmm1, -5(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl6): |
| movaps -6(%rcx), %xmm1 |
| movaps 10(%rcx), %xmm2 |
| L(Shl6Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit6Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl6LoopExit) |
| |
| palignr $6, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 26(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit6Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl6LoopExit) |
| |
| palignr $6, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 26(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit6Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl6LoopExit) |
| |
| palignr $6, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 26(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit6Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl6LoopExit) |
| |
| palignr $6, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 26(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -10(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -6(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl6LoopStart): |
| movaps 10(%rcx), %xmm2 |
| movaps 26(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 42(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 58(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $6, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $6, %xmm3, %xmm4 |
| jnz L(Shl6Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave6) |
| # endif |
| palignr $6, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $6, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl6LoopStart) |
| |
| L(Shl6LoopExit): |
| mov (%rcx), %r9 |
| mov 6(%rcx), %esi |
| mov %r9, (%rdx) |
| mov %esi, 6(%rdx) |
| mov $10, %rsi |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl7): |
| movaps -7(%rcx), %xmm1 |
| movaps 9(%rcx), %xmm2 |
| L(Shl7Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit7Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl7LoopExit) |
| |
| palignr $7, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 25(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit7Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl7LoopExit) |
| |
| palignr $7, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 25(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit7Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl7LoopExit) |
| |
| palignr $7, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 25(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit7Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl7LoopExit) |
| |
| palignr $7, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 25(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -9(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -7(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl7LoopStart): |
| movaps 9(%rcx), %xmm2 |
| movaps 25(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 41(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 57(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $7, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $7, %xmm3, %xmm4 |
| jnz L(Shl7Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave7) |
| # endif |
| palignr $7, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $7, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl7LoopStart) |
| |
| L(Shl7LoopExit): |
| mov (%rcx), %r9 |
| mov 5(%rcx), %esi |
| mov %r9, (%rdx) |
| mov %esi, 5(%rdx) |
| mov $9, %rsi |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl8): |
| movaps -8(%rcx), %xmm1 |
| movaps 8(%rcx), %xmm2 |
| L(Shl8Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit8Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl8LoopExit) |
| |
| palignr $8, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 24(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit8Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl8LoopExit) |
| |
| palignr $8, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 24(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit8Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl8LoopExit) |
| |
| palignr $8, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 24(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit8Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl8LoopExit) |
| |
| palignr $8, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 24(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -8(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -8(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl8LoopStart): |
| movaps 8(%rcx), %xmm2 |
| movaps 24(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 40(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 56(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $8, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $8, %xmm3, %xmm4 |
| jnz L(Shl8Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave8) |
| # endif |
| palignr $8, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $8, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl8LoopStart) |
| |
| L(Shl8LoopExit): |
| mov (%rcx), %r9 |
| mov $8, %rsi |
| mov %r9, (%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
| .p2align 4 |
| L(Shl9): |
| movaps -9(%rcx), %xmm1 |
| movaps 7(%rcx), %xmm2 |
| L(Shl9Start): |
| pcmpeqb %xmm2, %xmm0 |
| pmovmskb %xmm0, %rax |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit9Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl9LoopExit) |
| |
| palignr $9, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 23(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm1 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit9Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl9LoopExit) |
| |
| palignr $9, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 23(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| movaps %xmm2, %xmm3 |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit9Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl9LoopExit) |
| |
| palignr $9, %xmm1, %xmm2 |
| movaps %xmm2, (%rdx) |
| movaps 23(%rcx), %xmm2 |
| |
| pcmpeqb %xmm2, %xmm0 |
| lea 16(%rdx), %rdx |
| pmovmskb %xmm0, %rax |
| lea 16(%rcx), %rcx |
| # ifdef USE_AS_STRNCPY |
| sub $16, %r8 |
| jbe L(StrncpyExit9Case2OrCase3) |
| # endif |
| test %rax, %rax |
| jnz L(Shl9LoopExit) |
| |
| palignr $9, %xmm3, %xmm2 |
| movaps %xmm2, (%rdx) |
| lea 23(%rcx), %rcx |
| lea 16(%rdx), %rdx |
| |
| mov %rcx, %rax |
| and $-0x40, %rcx |
| sub %rcx, %rax |
| lea -7(%rcx), %rcx |
| sub %rax, %rdx |
| # ifdef USE_AS_STRNCPY |
| add %rax, %r8 |
| # endif |
| movaps -9(%rcx), %xmm1 |
| |
| /* 64 bytes loop */ |
| .p2align 4 |
| L(Shl9LoopStart): |
| movaps 7(%rcx), %xmm2 |
| movaps 23(%rcx), %xmm3 |
| movaps %xmm3, %xmm6 |
| movaps 39(%rcx), %xmm4 |
| movaps %xmm4, %xmm7 |
| movaps 55(%rcx), %xmm5 |
| pminub %xmm2, %xmm6 |
| pminub %xmm5, %xmm7 |
| pminub %xmm6, %xmm7 |
| pcmpeqb %xmm0, %xmm7 |
| pmovmskb %xmm7, %rax |
| movaps %xmm5, %xmm7 |
| palignr $9, %xmm4, %xmm5 |
| test %rax, %rax |
| palignr $9, %xmm3, %xmm4 |
| jnz L(Shl9Start) |
| # ifdef USE_AS_STRNCPY |
| sub $64, %r8 |
| jbe L(StrncpyLeave9) |
| # endif |
| palignr $9, %xmm2, %xmm3 |
| lea 64(%rcx), %rcx |
| palignr $9, %xmm1, %xmm2 |
| movaps %xmm7, %xmm1 |
| movaps %xmm5, 48(%rdx) |
| movaps %xmm4, 32(%rdx) |
| movaps %xmm3, 16(%rdx) |
| movaps %xmm2, (%rdx) |
| lea 64(%rdx), %rdx |
| jmp L(Shl9LoopStart) |
| |
| L(Shl9LoopExit): |
| mov -1(%rcx), %r9 |
| mov $7, %rsi |
| mov %r9, -1(%rdx) |
| jmp L(CopyFrom1To16Bytes) |
| |
/* Copy path for a source that is 10 bytes past 16-byte alignment.
   Consecutive aligned 16-byte loads are stitched together with
   palignr $10 so the (aligned) destination is written with full
   movaps stores.  Register roles: %rcx = src, %rdx = dst,
   %r8 = remaining byte budget (USE_AS_STRNCPY only), %xmm0 is
   all-zero on entry and is used as the null-byte probe.  Note the
   invariant: pcmpeqb writes into %xmm0, but on the "no null found"
   path every compare is all-false, so %xmm0 is zero again.  */
	.p2align 4
L(Shl10):
	movaps	-10(%rcx), %xmm1	/* previous aligned 16-byte chunk */
	movaps	6(%rcx), %xmm2		/* next aligned 16-byte chunk */
L(Shl10Start):
	pcmpeqb	%xmm2, %xmm0		/* byte mask of nulls in chunk */
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8		/* budget exhausted inside chunk? */
	jbe	L(StrncpyExit10Case2OrCase3)
# endif
	test	%rax, %rax		/* null byte found?  */
	jnz	L(Shl10LoopExit)

	palignr	$10, %xmm1, %xmm2	/* splice prev/next into one store */
	movaps	%xmm2, (%rdx)
	movaps	22(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit10Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl10LoopExit)

	palignr	$10, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	22(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit10Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl10LoopExit)

	palignr	$10, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	22(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit10Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl10LoopExit)

	palignr	$10, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	22(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Re-align the source to a 64-byte boundary for the unrolled
	   loop; %rax = overshoot, backed out of dst (and refunded to
	   %r8 for strncpy).  The -6 restores the +10 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-6(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-10(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl10LoopStart):
	movaps	6(%rcx), %xmm2
	movaps	22(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	38(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	54(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* fold all four chunks: the min */
	pminub	%xmm5, %xmm7		/* has a zero byte iff any chunk */
	pminub	%xmm6, %xmm7		/* has a zero byte */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$10, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$10, %xmm3, %xmm4
	jnz	L(Shl10Start)		/* null somewhere: re-scan 16 at a time */
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave10)
# endif
	palignr	$10, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$10, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl10LoopStart)

L(Shl10LoopExit):
	mov	-2(%rcx), %r9		/* copy the 8 straddling bytes */
	mov	$6, %rsi		/* tail offset for the common exit */
	mov	%r9, -2(%rdx)
	jmp	L(CopyFrom1To16Bytes)
| |
/* Same scheme as L(Shl10), for a source 11 bytes past 16-byte
   alignment: aligned loads spliced with palignr $11, aligned stores
   to the destination.  %xmm0 stays zero along the no-null path.  */
	.p2align 4
L(Shl11):
	movaps	-11(%rcx), %xmm1	/* previous aligned chunk */
	movaps	5(%rcx), %xmm2		/* next aligned chunk */
L(Shl11Start):
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit11Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl11LoopExit)

	palignr	$11, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	21(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit11Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl11LoopExit)

	palignr	$11, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	21(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit11Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl11LoopExit)

	palignr	$11, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	21(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit11Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl11LoopExit)

	palignr	$11, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	21(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Align source down to 64; -5 restores the +11 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-5(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-11(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl11LoopStart):
	movaps	5(%rcx), %xmm2
	movaps	21(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	37(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	53(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* min of the four chunks is zero */
	pminub	%xmm5, %xmm7		/* somewhere iff any chunk holds */
	pminub	%xmm6, %xmm7		/* the terminating null */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$11, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$11, %xmm3, %xmm4
	jnz	L(Shl11Start)
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave11)
# endif
	palignr	$11, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$11, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl11LoopStart)

L(Shl11LoopExit):
	mov	-3(%rcx), %r9		/* copy the 8 straddling bytes */
	mov	$5, %rsi		/* tail offset for the common exit */
	mov	%r9, -3(%rdx)
	jmp	L(CopyFrom1To16Bytes)
| |
/* Same scheme as L(Shl10), for a source 12 bytes past 16-byte
   alignment (palignr $12).  %xmm0 stays zero on the no-null path.  */
	.p2align 4
L(Shl12):
	movaps	-12(%rcx), %xmm1	/* previous aligned chunk */
	movaps	4(%rcx), %xmm2		/* next aligned chunk */
L(Shl12Start):
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit12Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit12Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit12Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	20(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit12Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl12LoopExit)

	palignr	$12, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	20(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Align source down to 64; -4 restores the +12 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-4(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-12(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl12LoopStart):
	movaps	4(%rcx), %xmm2
	movaps	20(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	36(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	52(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* min of the four chunks is zero */
	pminub	%xmm5, %xmm7		/* somewhere iff any chunk holds */
	pminub	%xmm6, %xmm7		/* the terminating null */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$12, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$12, %xmm3, %xmm4
	jnz	L(Shl12Start)
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave12)
# endif
	palignr	$12, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$12, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl12LoopStart)

L(Shl12LoopExit):
	mov	(%rcx), %r9d		/* copy the 4 straddling bytes */
	mov	$4, %rsi		/* tail offset for the common exit */
	mov	%r9d, (%rdx)
	jmp	L(CopyFrom1To16Bytes)
| |
/* Same scheme as L(Shl10), for a source 13 bytes past 16-byte
   alignment (palignr $13).  %xmm0 stays zero on the no-null path.  */
	.p2align 4
L(Shl13):
	movaps	-13(%rcx), %xmm1	/* previous aligned chunk */
	movaps	3(%rcx), %xmm2		/* next aligned chunk */
L(Shl13Start):
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit13Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl13LoopExit)

	palignr	$13, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	19(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit13Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl13LoopExit)

	palignr	$13, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	19(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit13Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl13LoopExit)

	palignr	$13, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	19(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit13Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl13LoopExit)

	palignr	$13, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	19(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Align source down to 64; -3 restores the +13 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-3(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-13(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl13LoopStart):
	movaps	3(%rcx), %xmm2
	movaps	19(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	35(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	51(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* min of the four chunks is zero */
	pminub	%xmm5, %xmm7		/* somewhere iff any chunk holds */
	pminub	%xmm6, %xmm7		/* the terminating null */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$13, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$13, %xmm3, %xmm4
	jnz	L(Shl13Start)
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave13)
# endif
	palignr	$13, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$13, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl13LoopStart)

L(Shl13LoopExit):
	mov	-1(%rcx), %r9d		/* copy the 4 straddling bytes */
	mov	$3, %rsi		/* tail offset for the common exit */
	mov	%r9d, -1(%rdx)
	jmp	L(CopyFrom1To16Bytes)
| |
/* Same scheme as L(Shl10), for a source 14 bytes past 16-byte
   alignment (palignr $14).  %xmm0 stays zero on the no-null path.  */
	.p2align 4
L(Shl14):
	movaps	-14(%rcx), %xmm1	/* previous aligned chunk */
	movaps	2(%rcx), %xmm2		/* next aligned chunk */
L(Shl14Start):
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit14Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl14LoopExit)

	palignr	$14, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	18(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit14Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl14LoopExit)

	palignr	$14, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	18(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit14Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl14LoopExit)

	palignr	$14, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	18(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit14Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl14LoopExit)

	palignr	$14, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	18(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Align source down to 64; -2 restores the +14 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-2(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-14(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl14LoopStart):
	movaps	2(%rcx), %xmm2
	movaps	18(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	34(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	50(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* min of the four chunks is zero */
	pminub	%xmm5, %xmm7		/* somewhere iff any chunk holds */
	pminub	%xmm6, %xmm7		/* the terminating null */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$14, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$14, %xmm3, %xmm4
	jnz	L(Shl14Start)
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave14)
# endif
	palignr	$14, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$14, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl14LoopStart)

L(Shl14LoopExit):
	mov	-2(%rcx), %r9d		/* copy the 4 straddling bytes */
	mov	$2, %rsi		/* tail offset for the common exit */
	mov	%r9d, -2(%rdx)
	jmp	L(CopyFrom1To16Bytes)
| |
/* Same scheme as L(Shl10), for a source 15 bytes past 16-byte
   alignment (palignr $15).  %xmm0 stays zero on the no-null path.
   This is the last ShlN section; in the strcat build its exit must
   jump explicitly, in the other builds it falls through into
   L(CopyFrom1To16Bytes) below.  */
	.p2align 4
L(Shl15):
	movaps	-15(%rcx), %xmm1	/* previous aligned chunk */
	movaps	1(%rcx), %xmm2		/* next aligned chunk */
L(Shl15Start):
	pcmpeqb	%xmm2, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit15Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl15LoopExit)

	palignr	$15, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	17(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm1
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit15Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl15LoopExit)

	palignr	$15, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	17(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
	movaps	%xmm2, %xmm3
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit15Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl15LoopExit)

	palignr	$15, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	17(%rcx), %xmm2

	pcmpeqb	%xmm2, %xmm0
	lea	16(%rdx), %rdx
	pmovmskb %xmm0, %rax
	lea	16(%rcx), %rcx
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	jbe	L(StrncpyExit15Case2OrCase3)
# endif
	test	%rax, %rax
	jnz	L(Shl15LoopExit)

	palignr	$15, %xmm3, %xmm2
	movaps	%xmm2, (%rdx)
	lea	17(%rcx), %rcx
	lea	16(%rdx), %rdx

	/* Align source down to 64; -1 restores the +15 phase.  */
	mov	%rcx, %rax
	and	$-0x40, %rcx
	sub	%rcx, %rax
	lea	-1(%rcx), %rcx
	sub	%rax, %rdx
# ifdef USE_AS_STRNCPY
	add	%rax, %r8
# endif
	movaps	-15(%rcx), %xmm1

/* 64 bytes loop */
	.p2align 4
L(Shl15LoopStart):
	movaps	1(%rcx), %xmm2
	movaps	17(%rcx), %xmm3
	movaps	%xmm3, %xmm6
	movaps	33(%rcx), %xmm4
	movaps	%xmm4, %xmm7
	movaps	49(%rcx), %xmm5
	pminub	%xmm2, %xmm6		/* min of the four chunks is zero */
	pminub	%xmm5, %xmm7		/* somewhere iff any chunk holds */
	pminub	%xmm6, %xmm7		/* the terminating null */
	pcmpeqb	%xmm0, %xmm7
	pmovmskb %xmm7, %rax
	movaps	%xmm5, %xmm7
	palignr	$15, %xmm4, %xmm5
	test	%rax, %rax
	palignr	$15, %xmm3, %xmm4
	jnz	L(Shl15Start)
# ifdef USE_AS_STRNCPY
	sub	$64, %r8
	jbe	L(StrncpyLeave15)
# endif
	palignr	$15, %xmm2, %xmm3
	lea	64(%rcx), %rcx
	palignr	$15, %xmm1, %xmm2
	movaps	%xmm7, %xmm1
	movaps	%xmm5, 48(%rdx)
	movaps	%xmm4, 32(%rdx)
	movaps	%xmm3, 16(%rdx)
	movaps	%xmm2, (%rdx)
	lea	64(%rdx), %rdx
	jmp	L(Shl15LoopStart)

L(Shl15LoopExit):
	mov	-3(%rcx), %r9d		/* copy the 4 straddling bytes */
	mov	$1, %rsi		/* tail offset for the common exit */
	mov	%r9d, -3(%rdx)
# ifdef USE_AS_STRCAT
	jmp	L(CopyFrom1To16Bytes)	/* in strcat the tail lives elsewhere */
# endif
| |
# ifndef USE_AS_STRCAT

/* Common exit: a null byte was found.  %rsi = offset already handled,
   %al/%ah = pmovmskb null mask for the final 16-byte chunk; advance
   src/dst by %rsi and dispatch on the lowest set mask bit to the
   L(ExitN) routine that copies exactly N bytes (null included).  */
	.p2align 4
L(CopyFrom1To16Bytes):
# ifdef USE_AS_STRNCPY
	add	$16, %r8		/* undo the speculative sub $16 */
# endif
	add	%rsi, %rdx
	add	%rsi, %rcx

	test	%al, %al		/* null in the low 8 bytes?  */
	jz	L(ExitHigh)
	test	$0x01, %al
	jnz	L(Exit1)
	test	$0x02, %al
	jnz	L(Exit2)
	test	$0x04, %al
	jnz	L(Exit3)
	test	$0x08, %al
	jnz	L(Exit4)
	test	$0x10, %al
	jnz	L(Exit5)
	test	$0x20, %al
	jnz	L(Exit6)
	test	$0x40, %al
	jnz	L(Exit7)

/* Fall-through: mask bit 7 set — the null is the 8th byte.  */
	.p2align 4
L(Exit8):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
# ifdef USE_AS_STPCPY
	lea	7(%rdx), %rax		/* stpcpy returns &dst[len] */
# else
	mov	%rdi, %rax		/* strcpy returns dst */
# endif
# ifdef USE_AS_STRNCPY
	sub	$8, %r8
	lea	8(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)	/* pad remainder with nulls */
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)		/* CF=1 iff *%rax == 0: advance %rax */
	sbb	$-1, %rax		/* past the last byte unless it is NUL */
# endif
# endif
	ret
| |
/* Null byte is in the high 8 bytes of the chunk: dispatch on %ah.  */
	.p2align 4
L(ExitHigh):
	test	$0x01, %ah
	jnz	L(Exit9)
	test	$0x02, %ah
	jnz	L(Exit10)
	test	$0x04, %ah
	jnz	L(Exit11)
	test	$0x08, %ah
	jnz	L(Exit12)
	test	$0x10, %ah
	jnz	L(Exit13)
	test	$0x20, %ah
	jnz	L(Exit14)
	test	$0x40, %ah
	jnz	L(Exit15)

/* Fall-through: the null is the 16th byte — copy all 16.  */
	.p2align 4
L(Exit16):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %rax
	mov	%rax, 8(%rdx)
# ifdef USE_AS_STPCPY
	lea	15(%rdx), %rax		/* stpcpy returns &dst[len] */
# else
	mov	%rdi, %rax		/* strcpy returns dst */
# endif
# ifdef USE_AS_STRNCPY
	sub	$16, %r8
	lea	16(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)	/* pad remainder with nulls */
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)		/* advance %rax unless *%rax is NUL */
	sbb	$-1, %rax
# endif
# endif
	ret
| |
# ifdef USE_AS_STRNCPY

/* strncpy exit when the byte budget %r8 may run out inside the same
   16-byte chunk that also contains a null: whichever comes first
   (budget boundary or null bit in %rax) picks the L(ExitN) routine.  */
	.p2align 4
L(CopyFrom1To16BytesCase2):
	add	$16, %r8		/* undo the speculative sub $16 */
	add	%rsi, %rcx
	lea	(%rsi, %rdx), %rsi
	lea	-9(%r8), %rdx		/* bit 15 of %r8-9 is set iff %r8 < 9 */
	and	$1<<7, %dh		/* (for the %r8 values possible here) */
	or	%al, %dh		/* combine with "null in low 8" mask */
	test	%dh, %dh
	lea	(%rsi), %rdx		/* %rdx = advanced destination */
	jz	L(ExitHighCase2)	/* neither: handle bytes 9..16 */

	/* Low 8 bytes: budget check interleaved with the null mask so
	   the earlier of the two terminates the copy.  */
	cmp	$1, %r8
	je	L(Exit1)
	test	$0x01, %al
	jnz	L(Exit1)
	cmp	$2, %r8
	je	L(Exit2)
	test	$0x02, %al
	jnz	L(Exit2)
	cmp	$3, %r8
	je	L(Exit3)
	test	$0x04, %al
	jnz	L(Exit3)
	cmp	$4, %r8
	je	L(Exit4)
	test	$0x08, %al
	jnz	L(Exit4)
	cmp	$5, %r8
	je	L(Exit5)
	test	$0x10, %al
	jnz	L(Exit5)
	cmp	$6, %r8
	je	L(Exit6)
	test	$0x20, %al
	jnz	L(Exit6)
	cmp	$7, %r8
	je	L(Exit7)
	test	$0x40, %al
	jnz	L(Exit7)
	jmp	L(Exit8)

/* Same interleaved dispatch for bytes 9..16 (mask bits in %ah).  */
	.p2align 4
L(ExitHighCase2):
	cmp	$9, %r8
	je	L(Exit9)
	test	$0x01, %ah
	jnz	L(Exit9)
	cmp	$10, %r8
	je	L(Exit10)
	test	$0x02, %ah
	jnz	L(Exit10)
	cmp	$11, %r8
	je	L(Exit11)
	test	$0x04, %ah
	jnz	L(Exit11)
	cmp	$12, %r8
	je	L(Exit12)
	test	$0x8, %ah
	jnz	L(Exit12)
	cmp	$13, %r8
	je	L(Exit13)
	test	$0x10, %ah
	jnz	L(Exit13)
	cmp	$14, %r8
	je	L(Exit14)
	test	$0x20, %ah
	jnz	L(Exit14)
	cmp	$15, %r8
	je	L(Exit15)
	test	$0x40, %ah
	jnz	L(Exit15)
	jmp	L(Exit16)
/* Dispatcher: %rax nonzero means a null byte was also seen (case 2),
   otherwise only the budget ran out (case 3).  */
L(CopyFrom1To16BytesCase2OrCase3):
	test	%rax, %rax
	jnz	L(CopyFrom1To16BytesCase2)

/* strncpy exit: budget exhausted, no null seen — copy exactly the
   remaining %r8 (1..16) bytes via a binary dispatch to L(ExitN).  */
	.p2align 4
L(CopyFrom1To16BytesCase3):
	add	$16, %r8		/* undo the speculative sub $16 */
	add	%rsi, %rdx
	add	%rsi, %rcx

	cmp	$16, %r8
	je	L(Exit16)
	cmp	$8, %r8
	je	L(Exit8)
	jg	L(More8Case3)
	cmp	$4, %r8
	je	L(Exit4)
	jg	L(More4Case3)
	cmp	$2, %r8
	jl	L(Exit1)
	je	L(Exit2)
	jg	L(Exit3)
L(More8Case3): /* but less than 16 */
	cmp	$12, %r8
	je	L(Exit12)
	jl	L(Less12Case3)
	cmp	$14, %r8
	jl	L(Exit13)
	je	L(Exit14)
	jg	L(Exit15)
L(More4Case3): /* but less than 8 */
	cmp	$6, %r8
	jl	L(Exit5)
	je	L(Exit6)
	jg	L(Exit7)
L(Less12Case3): /* but more than 8 */
	cmp	$10, %r8
	jl	L(Exit9)
	je	L(Exit10)
	jg	L(Exit11)
# endif
| |
/* L(ExitN), N = 1..7: copy exactly N bytes from %rcx to %rdx using
   overlapping scalar moves, then produce the return value:
   stpcpy -> %rdx + N - 1 (end of string), otherwise the original
   destination in %rdi.  For strncpy, if %r8 bytes remain unused the
   tail is zero-filled; for stpcpy+strncpy the cmpb/sbb pair bumps
   the return past the copied byte unless it was the NUL.  */
	.p2align 4
L(Exit1):
	movb	(%rcx), %al
	movb	%al, (%rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$1, %r8
	lea	1(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)		/* CF=1 iff *%rax == 0 */
	sbb	$-1, %rax		/* %rax += 1 - CF */
# endif
# endif
	ret

	.p2align 4
L(Exit2):
	movw	(%rcx), %ax
	movw	%ax, (%rdx)
# ifdef USE_AS_STPCPY
	lea	1(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$2, %r8
	lea	2(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit3):
	movw	(%rcx), %ax
	movw	%ax, (%rdx)
	movb	2(%rcx), %al
	movb	%al, 2(%rdx)
# ifdef USE_AS_STPCPY
	lea	2(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$3, %r8
	lea	3(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit4):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
# ifdef USE_AS_STPCPY
	lea	3(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$4, %r8
	lea	4(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit5):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	movb	4(%rcx), %al
	movb	%al, 4(%rdx)
# ifdef USE_AS_STPCPY
	lea	4(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$5, %r8
	lea	5(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit6):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	movw	4(%rcx), %ax
	movw	%ax, 4(%rdx)
# ifdef USE_AS_STPCPY
	lea	5(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$6, %r8
	lea	6(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit7):
	movl	(%rcx), %eax
	movl	%eax, (%rdx)
	movl	3(%rcx), %eax		/* overlapping 4-byte loads cover 7 */
	movl	%eax, 3(%rdx)
# ifdef USE_AS_STPCPY
	lea	6(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$7, %r8
	lea	7(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret
| |
/* L(ExitN), N = 9..15: like L(Exit1..7) but the copies use one
   8-byte move plus an overlapping 4- or 8-byte move to cover N.  */
	.p2align 4
L(Exit9):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	5(%rcx), %eax		/* overlap covers bytes 5..8 */
	mov	%eax, 5(%rdx)
# ifdef USE_AS_STPCPY
	lea	8(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$9, %r8
	lea	9(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)		/* advance %rax unless *%rax is NUL */
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit10):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	6(%rcx), %eax
	mov	%eax, 6(%rdx)
# ifdef USE_AS_STPCPY
	lea	9(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$10, %r8
	lea	10(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit11):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	7(%rcx), %eax
	mov	%eax, 7(%rdx)
# ifdef USE_AS_STPCPY
	lea	10(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$11, %r8
	lea	11(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit12):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	8(%rcx), %eax
	mov	%eax, 8(%rdx)
# ifdef USE_AS_STPCPY
	lea	11(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$12, %r8
	lea	12(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit13):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	5(%rcx), %rax		/* overlapping 8-byte move */
	mov	%rax, 5(%rdx)
# ifdef USE_AS_STPCPY
	lea	12(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$13, %r8
	lea	13(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit14):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	6(%rcx), %rax
	mov	%rax, 6(%rdx)
# ifdef USE_AS_STPCPY
	lea	13(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$14, %r8
	lea	14(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret

	.p2align 4
L(Exit15):
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	7(%rcx), %rax
	mov	%rax, 7(%rdx)
# ifdef USE_AS_STPCPY
	lea	14(%rdx), %rax
# else
	mov	%rdi, %rax
# endif
# ifdef USE_AS_STRNCPY
	sub	$15, %r8
	lea	15(%rdx), %rcx
	jnz	L(StrncpyFillTailWithZero1)
# ifdef USE_AS_STPCPY
	cmpb	$1, (%rax)
	sbb	$-1, %rax
# endif
# endif
	ret
| |
# ifdef USE_AS_STRNCPY
/* L(FillN), N = 0..16: store exactly N zero bytes at %rcx using
   overlapping scalar stores.  Callers arrange %rdx = 0 beforehand
   (see L(StrncpyFillTailWithZero1)).  */
	.p2align 4
L(Fill0):
	ret

	.p2align 4
L(Fill1):
	movb	%dl, (%rcx)
	ret

	.p2align 4
L(Fill2):
	movw	%dx, (%rcx)
	ret

	.p2align 4
L(Fill3):
	movw	%dx, (%rcx)
	movb	%dl, 2(%rcx)
	ret

	.p2align 4
L(Fill4):
	movl	%edx, (%rcx)
	ret

	.p2align 4
L(Fill5):
	movl	%edx, (%rcx)
	movb	%dl, 4(%rcx)
	ret

	.p2align 4
L(Fill6):
	movl	%edx, (%rcx)
	movw	%dx, 4(%rcx)
	ret

	.p2align 4
L(Fill7):
	movl	%edx, (%rcx)
	movl	%edx, 3(%rcx)		/* overlapping stores cover 7 */
	ret

	.p2align 4
L(Fill8):
	mov	%rdx, (%rcx)
	ret

	.p2align 4
L(Fill9):
	mov	%rdx, (%rcx)
	movb	%dl, 8(%rcx)
	ret

	.p2align 4
L(Fill10):
	mov	%rdx, (%rcx)
	movw	%dx, 8(%rcx)
	ret

	.p2align 4
L(Fill11):
	mov	%rdx, (%rcx)
	movl	%edx, 7(%rcx)
	ret

	.p2align 4
L(Fill12):
	mov	%rdx, (%rcx)
	movl	%edx, 8(%rcx)
	ret

	.p2align 4
L(Fill13):
	mov	%rdx, (%rcx)
	mov	%rdx, 5(%rcx)		/* overlapping stores cover 13 */
	ret

	.p2align 4
L(Fill14):
	mov	%rdx, (%rcx)
	mov	%rdx, 6(%rcx)
	ret

	.p2align 4
L(Fill15):
	mov	%rdx, (%rcx)
	mov	%rdx, 7(%rcx)
	ret

	.p2align 4
L(Fill16):
	mov	%rdx, (%rcx)
	mov	%rdx, 8(%rcx)
	ret
| |
/* Dispatch on %r8 (0..16) to the L(FillN) routine that writes the
   final zero bytes.  L(StrncpyFillExit1) first re-biases %r8 after
   an over-subtraction of 16.  */
	.p2align 4
L(StrncpyFillExit1):
	lea	16(%r8), %r8
L(FillFrom1To16Bytes):
	test	%r8, %r8
	jz	L(Fill0)
	cmp	$16, %r8
	je	L(Fill16)
	cmp	$8, %r8
	je	L(Fill8)
	jg	L(FillMore8)
	cmp	$4, %r8
	je	L(Fill4)
	jg	L(FillMore4)
	cmp	$2, %r8
	jl	L(Fill1)
	je	L(Fill2)
	jg	L(Fill3)
L(FillMore8): /* but less than 16 */
	cmp	$12, %r8
	je	L(Fill12)
	jl	L(FillLess12)
	cmp	$14, %r8
	jl	L(Fill13)
	je	L(Fill14)
	jg	L(Fill15)
L(FillMore4): /* but less than 8 */
	cmp	$6, %r8
	jl	L(Fill5)
	je	L(Fill6)
	jg	L(Fill7)
L(FillLess12): /* but more than 8 */
	cmp	$10, %r8
	jl	L(Fill9)
	je	L(Fill10)
	jmp	L(Fill11)
| |
/* strncpy padding: zero-fill the remaining %r8 bytes at %rcx.
   Small remainders go straight to the scalar fill dispatch; larger
   ones align %rcx to 16 and stream zeros with movdqa 64/16 bytes at
   a time, finishing through L(FillFrom1To16Bytes).  %rdx is zeroed
   here for the scalar L(FillN) stores.  */
	.p2align 4
L(StrncpyFillTailWithZero1):
	xor	%rdx, %rdx		/* zero source for scalar fills */
	sub	$16, %r8
	jbe	L(StrncpyFillExit1)	/* <= 16 bytes: scalar path */

	pxor	%xmm0, %xmm0
	mov	%rdx, (%rcx)		/* unaligned 16-byte zero head */
	mov	%rdx, 8(%rcx)

	lea	16(%rcx), %rcx

	mov	%rcx, %rdx		/* round %rcx down to 16 bytes, */
	and	$0xf, %rdx		/* refunding the overlap to %r8 */
	sub	%rdx, %rcx
	add	%rdx, %r8
	xor	%rdx, %rdx
	sub	$64, %r8
	jb	L(StrncpyFillLess64)

L(StrncpyFillLoopMovdqa):
	movdqa	%xmm0, (%rcx)
	movdqa	%xmm0, 16(%rcx)
	movdqa	%xmm0, 32(%rcx)
	movdqa	%xmm0, 48(%rcx)
	lea	64(%rcx), %rcx
	sub	$64, %r8
	jae	L(StrncpyFillLoopMovdqa)

L(StrncpyFillLess64):
	add	$32, %r8
	jl	L(StrncpyFillLess32)
	movdqa	%xmm0, (%rcx)
	movdqa	%xmm0, 16(%rcx)
	lea	32(%rcx), %rcx
	sub	$16, %r8
	jl	L(StrncpyFillExit1)
	movdqa	%xmm0, (%rcx)
	lea	16(%rcx), %rcx
	jmp	L(FillFrom1To16Bytes)

L(StrncpyFillLess32):
	add	$16, %r8
	jl	L(StrncpyFillExit1)
	movdqa	%xmm0, (%rcx)
	lea	16(%rcx), %rcx
	jmp	L(FillFrom1To16Bytes)
| |
/* strncpy with n == 0: nothing copied; return the destination,
   which was saved in %rdx at entry.  */
	.p2align 4
L(Exit0):
	mov	%rdx, %rax
	ret
| |
/* strncpy entry path with 9 <= %r8 < 16 and no null in the first 8
   bytes: interleave budget checks against %r8 with byte-wise null
   probes of bytes 8..13; otherwise copy 15 bytes via overlapping
   8-byte moves.  */
	.p2align 4
L(StrncpyExit15Bytes):
	cmp	$9, %r8
	je	L(Exit9)
	cmpb	$0, 8(%rcx)
	jz	L(Exit9)
	cmp	$10, %r8
	je	L(Exit10)
	cmpb	$0, 9(%rcx)
	jz	L(Exit10)
	cmp	$11, %r8
	je	L(Exit11)
	cmpb	$0, 10(%rcx)
	jz	L(Exit11)
	cmp	$12, %r8
	je	L(Exit12)
	cmpb	$0, 11(%rcx)
	jz	L(Exit12)
	cmp	$13, %r8
	je	L(Exit13)
	cmpb	$0, 12(%rcx)
	jz	L(Exit13)
	cmp	$14, %r8
	je	L(Exit14)
	cmpb	$0, 13(%rcx)
	jz	L(Exit14)
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
	mov	7(%rcx), %rax		/* overlap covers bytes 7..14 */
	mov	%rax, 7(%rdx)
# ifdef USE_AS_STPCPY
	lea	14(%rdx), %rax
	cmpb	$1, (%rax)		/* advance past last byte unless NUL */
	sbb	$-1, %rax
# else
	mov	%rdi, %rax
# endif
	ret
| |
/* strncpy entry path with 1 <= %r8 <= 8: interleave budget checks
   with null probes of bytes 0..6; otherwise copy 8 bytes at once.  */
	.p2align 4
L(StrncpyExit8Bytes):
	cmp	$1, %r8
	je	L(Exit1)
	cmpb	$0, (%rcx)
	jz	L(Exit1)
	cmp	$2, %r8
	je	L(Exit2)
	cmpb	$0, 1(%rcx)
	jz	L(Exit2)
	cmp	$3, %r8
	je	L(Exit3)
	cmpb	$0, 2(%rcx)
	jz	L(Exit3)
	cmp	$4, %r8
	je	L(Exit4)
	cmpb	$0, 3(%rcx)
	jz	L(Exit4)
	cmp	$5, %r8
	je	L(Exit5)
	cmpb	$0, 4(%rcx)
	jz	L(Exit5)
	cmp	$6, %r8
	je	L(Exit6)
	cmpb	$0, 5(%rcx)
	jz	L(Exit6)
	cmp	$7, %r8
	je	L(Exit7)
	cmpb	$0, 6(%rcx)
	jz	L(Exit7)
	mov	(%rcx), %rax
	mov	%rax, (%rdx)
# ifdef USE_AS_STPCPY
	lea	7(%rdx), %rax
	cmpb	$1, (%rax)		/* advance past last byte unless NUL */
	sbb	$-1, %rax
# else
	mov	%rdi, %rax
# endif
	ret

# endif
# endif
| |
# ifdef USE_AS_STRNCPY
/* Aligned 64-byte-loop exits for strncpy: budget ran out during an
   unrolled iteration.  Case 2 = a null was also detected (%rax mask
   nonzero), case 3 = budget only.  %xmm4..%xmm7 hold the four chunks
   of the current iteration; flush as many whole chunks as the budget
   allows, tracking progress in %rsi for the common tails.  */
	.p2align 4
L(StrncpyLeaveCase2OrCase3):
	test	%rax, %rax
	jnz	L(Aligned64LeaveCase2)

L(Aligned64LeaveCase3):
	lea	64(%r8), %r8		/* undo the speculative sub $64 */
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase3)
	movaps	%xmm4, -64(%rdx)
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase3)
	movaps	%xmm5, -48(%rdx)
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase3)
	movaps	%xmm6, -32(%rdx)
	lea	16(%rsi), %rsi
	lea	-16(%r8), %r8
	jmp	L(CopyFrom1To16BytesCase3)

L(Aligned64LeaveCase2):
	pcmpeqb	%xmm4, %xmm0		/* re-probe chunk by chunk to find */
	pmovmskb %xmm0, %rax		/* which one holds the null */
	add	$48, %r8
	jle	L(CopyFrom1To16BytesCase2OrCase3)
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqb	%xmm5, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm4, -64(%rdx)
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase2OrCase3)
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqb	%xmm6, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm5, -48(%rdx)
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(CopyFrom1To16BytesCase2OrCase3)
	test	%rax, %rax
	jnz	L(CopyFrom1To16Bytes)

	pcmpeqb	%xmm7, %xmm0
	pmovmskb %xmm0, %rax
	movaps	%xmm6, -32(%rdx)
	lea	16(%rsi), %rsi
	lea	-16(%r8), %r8
	jmp	L(CopyFrom1To16BytesCase2)
| /*--------------------------------------------------*/ |
| .p2align 4 |
| L(StrncpyExit1Case2OrCase3): |
| movdqu -1(%rcx), %xmm0 |
| movdqu %xmm0, -1(%rdx) |
| mov $15, %rsi |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit2Case2OrCase3): |
| movdqu -2(%rcx), %xmm0 |
| movdqu %xmm0, -2(%rdx) |
| mov $14, %rsi |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit3Case2OrCase3): |
| movdqu -3(%rcx), %xmm0 |
| movdqu %xmm0, -3(%rdx) |
| mov $13, %rsi |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit4Case2OrCase3): |
| movdqu -4(%rcx), %xmm0 |
| movdqu %xmm0, -4(%rdx) |
| mov $12, %rsi |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit5Case2OrCase3): |
| movdqu -5(%rcx), %xmm0 |
| movdqu %xmm0, -5(%rdx) |
| mov $11, %rsi |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit6Case2OrCase3): |
| mov (%rcx), %rsi |
| mov 6(%rcx), %r9d |
| mov %r9d, 6(%rdx) |
| mov %rsi, (%rdx) |
| test %rax, %rax |
| mov $10, %rsi |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit7Case2OrCase3): |
| mov (%rcx), %rsi |
| mov 5(%rcx), %r9d |
| mov %r9d, 5(%rdx) |
| mov %rsi, (%rdx) |
| test %rax, %rax |
| mov $9, %rsi |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit8Case2OrCase3): |
| mov (%rcx), %r9 |
| mov $8, %rsi |
| mov %r9, (%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit9Case2OrCase3): |
| mov -1(%rcx), %r9 |
| mov $7, %rsi |
| mov %r9, -1(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit10Case2OrCase3): |
| mov -2(%rcx), %r9 |
| mov $6, %rsi |
| mov %r9, -2(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit11Case2OrCase3): |
| mov -3(%rcx), %r9 |
| mov $5, %rsi |
| mov %r9, -3(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit12Case2OrCase3): |
| mov (%rcx), %r9d |
| mov $4, %rsi |
| mov %r9d, (%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit13Case2OrCase3): |
| mov -1(%rcx), %r9d |
| mov $3, %rsi |
| mov %r9d, -1(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit14Case2OrCase3): |
| mov -2(%rcx), %r9d |
| mov $2, %rsi |
| mov %r9d, -2(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
| .p2align 4 |
| L(StrncpyExit15Case2OrCase3): |
| mov -3(%rcx), %r9d |
| mov $1, %rsi |
| mov %r9d, -3(%rdx) |
| test %rax, %rax |
| jnz L(CopyFrom1To16BytesCase2) |
| jmp L(CopyFrom1To16BytesCase3) |
| |
/* strncpy budget exhausted inside the Shl1 64-byte loop: flush as
   many of the iteration's chunks as %r8 allows (tracking progress in
   %rsi), then copy the final 15 straddling bytes with two
   overlapping 8-byte moves and finish via the case-3 tail.  */
	.p2align 4
L(StrncpyLeave1):
	movaps	%xmm2, %xmm3
	add	$48, %r8		/* rebias after the sub $64 */
	jle	L(StrncpyExit1)
	palignr	$1, %xmm1, %xmm2
	movaps	%xmm2, (%rdx)
	movaps	31(%rcx), %xmm2
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(StrncpyExit1)
	palignr	$1, %xmm3, %xmm2
	movaps	%xmm2, 16(%rdx)
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(StrncpyExit1)
	movaps	%xmm4, 32(%rdx)		/* already palignr'ed in the loop */
	lea	16(%rsi), %rsi
	sub	$16, %r8
	jbe	L(StrncpyExit1)
	movaps	%xmm5, 48(%rdx)
	lea	16(%rsi), %rsi
	lea	-16(%r8), %r8

L(StrncpyExit1):
	lea	15(%rdx, %rsi), %rdx	/* end of the copied region */
	lea	15(%rcx, %rsi), %rcx
	mov	-15(%rcx), %rsi		/* overlapping 8-byte moves: 15 bytes */
	mov	-8(%rcx), %rax
	mov	%rsi, -15(%rdx)
	mov	%rax, -8(%rdx)
	xor	%rsi, %rsi		/* no extra tail offset */
	jmp	L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 2 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $2 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave2):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit2)
	palignr $2, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 30(%rcx), %xmm2	/* next aligned chunk (32 - 2) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit2)
	palignr $2, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit2)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit2)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 14 bytes (two overlapping qword moves), then finish
   in the common Case3 tail.  */
L(StrncpyExit2):
	lea 14(%rdx, %rsi), %rdx
	lea 14(%rcx, %rsi), %rcx
	mov -14(%rcx), %rsi
	mov -8(%rcx), %rax
	mov %rsi, -14(%rdx)
	mov %rax, -8(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 3 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $3 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave3):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit3)
	palignr $3, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 29(%rcx), %xmm2	/* next aligned chunk (32 - 3) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit3)
	palignr $3, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit3)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit3)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 13 bytes (two overlapping qword moves), then finish
   in the common Case3 tail.  */
L(StrncpyExit3):
	lea 13(%rdx, %rsi), %rdx
	lea 13(%rcx, %rsi), %rcx
	mov -13(%rcx), %rsi
	mov -8(%rcx), %rax
	mov %rsi, -13(%rdx)
	mov %rax, -8(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 4 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $4 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave4):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit4)
	palignr $4, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 28(%rcx), %xmm2	/* next aligned chunk (32 - 4) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit4)
	palignr $4, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit4)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit4)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 12 bytes (qword + dword), then finish in the common
   Case3 tail.  */
L(StrncpyExit4):
	lea 12(%rdx, %rsi), %rdx
	lea 12(%rcx, %rsi), %rcx
	mov -12(%rcx), %rsi
	mov -4(%rcx), %eax
	mov %rsi, -12(%rdx)
	mov %eax, -4(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 5 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $5 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave5):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit5)
	palignr $5, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 27(%rcx), %xmm2	/* next aligned chunk (32 - 5) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit5)
	palignr $5, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit5)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit5)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 11 bytes (overlapping qword + dword), then finish in
   the common Case3 tail.  */
L(StrncpyExit5):
	lea 11(%rdx, %rsi), %rdx
	lea 11(%rcx, %rsi), %rcx
	mov -11(%rcx), %rsi
	mov -4(%rcx), %eax
	mov %rsi, -11(%rdx)
	mov %eax, -4(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 6 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $6 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave6):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit6)
	palignr $6, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 26(%rcx), %xmm2	/* next aligned chunk (32 - 6) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit6)
	palignr $6, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit6)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit6)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 10 bytes (qword + word), then finish in the common
   Case3 tail.  */
L(StrncpyExit6):
	lea 10(%rdx, %rsi), %rdx
	lea 10(%rcx, %rsi), %rcx
	mov -10(%rcx), %rsi
	movw -2(%rcx), %ax
	mov %rsi, -10(%rdx)
	movw %ax, -2(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 7 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $7 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave7):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit7)
	palignr $7, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 25(%rcx), %xmm2	/* next aligned chunk (32 - 7) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit7)
	palignr $7, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit7)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit7)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 9 bytes (qword + single byte via %ah), then finish in
   the common Case3 tail.  */
L(StrncpyExit7):
	lea 9(%rdx, %rsi), %rdx
	lea 9(%rcx, %rsi), %rcx
	mov -9(%rcx), %rsi
	movb -1(%rcx), %ah
	mov %rsi, -9(%rdx)
	movb %ah, -1(%rdx)
	xor %rsi, %rsi
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 8 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $8 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave8):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit8)
	palignr $8, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 24(%rcx), %xmm2	/* next aligned chunk (32 - 8) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit8)
	palignr $8, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit8)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit8)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 8 bytes with one qword move, then finish in the
   common Case3 tail.  */
L(StrncpyExit8):
	lea 8(%rdx, %rsi), %rdx
	lea 8(%rcx, %rsi), %rcx
	mov -8(%rcx), %rax
	xor %rsi, %rsi
	mov %rax, -8(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 9 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $9 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave9):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit9)
	palignr $9, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 23(%rcx), %xmm2	/* next aligned chunk (32 - 9) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit9)
	palignr $9, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit9)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit9)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 7 bytes with one overlapping qword move (re-writes
   one already-copied byte), then finish in the common Case3 tail.  */
L(StrncpyExit9):
	lea 7(%rdx, %rsi), %rdx
	lea 7(%rcx, %rsi), %rcx
	mov -8(%rcx), %rax
	xor %rsi, %rsi
	mov %rax, -8(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 10 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $10 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave10):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit10)
	palignr $10, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 22(%rcx), %xmm2	/* next aligned chunk (32 - 10) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit10)
	palignr $10, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit10)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit10)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 6 bytes with one overlapping qword move (re-writes
   two already-copied bytes), then finish in the common Case3 tail.  */
L(StrncpyExit10):
	lea 6(%rdx, %rsi), %rdx
	lea 6(%rcx, %rsi), %rcx
	mov -8(%rcx), %rax
	xor %rsi, %rsi
	mov %rax, -8(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 11 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $11 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave11):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit11)
	palignr $11, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 21(%rcx), %xmm2	/* next aligned chunk (32 - 11) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit11)
	palignr $11, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit11)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit11)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 5 bytes with one overlapping qword move (re-writes
   three already-copied bytes), then finish in the common Case3 tail.  */
L(StrncpyExit11):
	lea 5(%rdx, %rsi), %rdx
	lea 5(%rcx, %rsi), %rcx
	mov -8(%rcx), %rax
	xor %rsi, %rsi
	mov %rax, -8(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 12 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $12 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave12):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit12)
	palignr $12, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 20(%rcx), %xmm2	/* next aligned chunk (32 - 12) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit12)
	palignr $12, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit12)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit12)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 4 bytes with one dword move, then finish in the
   common Case3 tail.  */
L(StrncpyExit12):
	lea 4(%rdx, %rsi), %rdx
	lea 4(%rcx, %rsi), %rcx
	mov -4(%rcx), %eax
	xor %rsi, %rsi
	mov %eax, -4(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 13 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $13 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave13):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit13)
	palignr $13, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 19(%rcx), %xmm2	/* next aligned chunk (32 - 13) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit13)
	palignr $13, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit13)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit13)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 3 bytes with one overlapping dword move (re-writes
   one already-copied byte), then finish in the common Case3 tail.  */
L(StrncpyExit13):
	lea 3(%rdx, %rsi), %rdx
	lea 3(%rcx, %rsi), %rcx
	mov -4(%rcx), %eax
	xor %rsi, %rsi
	mov %eax, -4(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 14 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $14 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave14):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit14)
	palignr $14, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 18(%rcx), %xmm2	/* next aligned chunk (32 - 14) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit14)
	palignr $14, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit14)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit14)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final 2 bytes with one word move, then finish in the
   common Case3 tail.  */
L(StrncpyExit14):
	lea 2(%rdx, %rsi), %rdx
	lea 2(%rcx, %rsi), %rcx
	movw -2(%rcx), %ax
	xor %rsi, %rsi
	movw %ax, -2(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
	.p2align 4
/* strncpy bulk tail, source 15 bytes past 16-byte alignment: while the
   biased remaining count in %r8 allows (re-centred by `add $48`), realign
   whole 16-byte chunks with palignr $15 and store them, accumulating the
   copied offset in %rsi; %xmm4/%xmm5 are stored unmodified.  */
L(StrncpyLeave15):
	movaps %xmm2, %xmm3	/* keep chunk for the next palignr */
	add $48, %r8
	jle L(StrncpyExit15)
	palignr $15, %xmm1, %xmm2
	movaps %xmm2, (%rdx)
	movaps 17(%rcx), %xmm2	/* next aligned chunk (32 - 15) */
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit15)
	palignr $15, %xmm3, %xmm2
	movaps %xmm2, 16(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit15)
	movaps %xmm4, 32(%rdx)
	lea 16(%rsi), %rsi
	sub $16, %r8
	jbe L(StrncpyExit15)
	movaps %xmm5, 48(%rdx)
	lea 16(%rsi), %rsi
	lea -16(%r8), %r8

/* Copy the final single byte (via %ah), then finish in the common
   Case3 tail.  */
L(StrncpyExit15):
	lea 1(%rdx, %rsi), %rdx
	lea 1(%rcx, %rsi), %rcx
	movb -1(%rcx), %ah
	xor %rsi, %rsi
	movb %ah, -1(%rdx)
	jmp L(CopyFrom1To16BytesCase3)
| |
| # endif |
| # ifndef USE_AS_STRCAT |
| END (STRCPY) |
| # endif |
| #endif |