| /* strchr (str, ch) -- Return pointer to first occurrence of CH in STR. |
| For AMD x86-64. |
| Copyright (C) 2009-2014 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include <sysdep.h> |
| |
| .text |
| ENTRY (strchr) /* strchr (str, ch): find the first byte equal to ch in str. |
| In: %rdi = str, %esi = ch (only its low byte, %sil, is compared). |
| Out: %rax = address of the first byte equal to ch. If the terminating |
| NUL is reached first, %rax = 0, or the address of that NUL when this |
| file is assembled with AS_STRCHRNUL defined. |
| Registers written: %rax, %rcx, %rdx, %rdi, %r8, %r9, %xmm0-%xmm6. */ |
| movd %esi, %xmm1 /* low dword of xmm1 = ch */ |
| movl %edi, %eax |
| andl $4095, %eax /* eax = str & 4095: offset of str inside its 4096-byte block */ |
| punpcklbw %xmm1, %xmm1 /* with punpcklwd and pshufd below: copy the low byte of ch into all 16 bytes of xmm1 */ |
| cmpl $4032, %eax /* 4032 = 4096 - 64 */ |
| punpcklwd %xmm1, %xmm1 |
| pshufd $0, %xmm1, %xmm1 |
| jg L(cross_page) /* str + 63 would lie beyond the 4096-byte boundary, so 64 bytes are not read from str directly */ |
| movdqu (%rdi), %xmm0 /* unaligned load of str[0..15] */ |
| pxor %xmm3, %xmm3 /* xmm3 = 0, the NUL pattern */ |
| movdqa %xmm0, %xmm4 |
| pcmpeqb %xmm1, %xmm0 /* 0xff in every byte equal to ch */ |
| pcmpeqb %xmm3, %xmm4 /* 0xff in every byte equal to NUL */ |
| por %xmm4, %xmm0 |
| pmovmskb %xmm0, %eax /* bit i of eax set when str[i] is ch or NUL */ |
| test %eax, %eax |
| je L(next_48_bytes) /* neither was found in the first 16 bytes */ |
| bsf %eax, %eax /* eax = index of the first ch-or-NUL byte */ |
| #ifdef AS_STRCHRNUL |
| leaq (%rdi,%rax), %rax |
| #else |
| movl $0, %edx /* rdx = 0, the not-found result */ |
| leaq (%rdi,%rax), %rax |
| cmpb %sil, (%rax) /* did the scan stop on ch or on the terminating NUL? */ |
| cmovne %rdx, %rax /* the byte is not ch, so it is the terminator: return 0 */ |
| #endif |
| ret |
| |
| .p2align 3 |
| L(next_48_bytes): /* Test str[16..63] with three more unaligned loads. The three |
| masks are merged into rax at bit offsets 16, 32 and 48; nothing is placed |
| in bits 0-15 (the first 16 bytes were already found to hold no match), so |
| a bit index in rax is a byte offset from rdi. */ |
| movdqu 16(%rdi), %xmm0 |
| movdqa %xmm0, %xmm4 |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm3, %xmm4 |
| por %xmm4, %xmm0 |
| pmovmskb %xmm0, %ecx /* mask for str[16..31] */ |
| movdqu 32(%rdi), %xmm0 |
| movdqa %xmm0, %xmm4 |
| pcmpeqb %xmm1, %xmm0 |
| salq $16, %rcx |
| pcmpeqb %xmm3, %xmm4 |
| por %xmm4, %xmm0 |
| pmovmskb %xmm0, %eax /* mask for str[32..47] */ |
| movdqu 48(%rdi), %xmm0 |
| pcmpeqb %xmm0, %xmm3 /* NUL test; this overwrites the zero held in xmm3 */ |
| salq $32, %rax |
| pcmpeqb %xmm1, %xmm0 |
| orq %rcx, %rax |
| por %xmm3, %xmm0 |
| pmovmskb %xmm0, %ecx /* mask for str[48..63] */ |
| salq $48, %rcx |
| orq %rcx, %rax |
| testq %rax, %rax |
| jne L(return) |
| L(loop_start): |
| /* We use this alignment to force the loop to be aligned to 8 but not |
| 16 bytes. This gives better scheduling on AMD processors. */ |
| .p2align 4 |
| pxor %xmm6, %xmm6 /* xmm6 = 0 for the zero tests in and after the loop */ |
| andq $-64, %rdi /* round rdi down to a 64-byte boundary so the loop can use aligned loads */ |
| .p2align 3 |
| L(loop64): /* Scan one aligned 64-byte block per iteration. For each byte x, |
| min (x ^ ch, x) is zero exactly when x == ch or x == 0, so the unsigned |
| minimum over all four 16-byte chunks, compared with zero, tells whether |
| the block contains ch or the terminator. */ |
| addq $64, %rdi /* advance to the next aligned block before loading */ |
| movdqa (%rdi), %xmm5 |
| movdqa 16(%rdi), %xmm2 |
| movdqa 32(%rdi), %xmm3 |
| pxor %xmm1, %xmm5 |
| movdqa 48(%rdi), %xmm4 |
| pxor %xmm1, %xmm2 |
| pxor %xmm1, %xmm3 |
| pminub (%rdi), %xmm5 |
| pxor %xmm1, %xmm4 |
| pminub 16(%rdi), %xmm2 |
| pminub 32(%rdi), %xmm3 |
| pminub %xmm2, %xmm5 /* from here xmm5 accumulates the minimum over all chunks */ |
| pminub 48(%rdi), %xmm4 |
| pminub %xmm3, %xmm5 |
| pminub %xmm4, %xmm5 |
| pcmpeqb %xmm6, %xmm5 |
| pmovmskb %xmm5, %eax |
| |
| testl %eax, %eax |
| je L(loop64) /* no ch and no NUL in this block */ |
| |
| movdqa (%rdi), %xmm5 /* xmm5 was consumed as the combined minimum, so chunk 0 is reloaded and tested again */ |
| movdqa %xmm5, %xmm0 |
| pcmpeqb %xmm1, %xmm5 |
| pcmpeqb %xmm6, %xmm0 |
| por %xmm0, %xmm5 |
| pcmpeqb %xmm6, %xmm2 /* xmm2-xmm4 still hold the per-chunk minima of chunks 1-3 */ |
| pcmpeqb %xmm6, %xmm3 |
| pcmpeqb %xmm6, %xmm4 |
| |
| pmovmskb %xmm5, %ecx /* chunk 0 -> bits 0-15 */ |
| pmovmskb %xmm2, %eax /* chunk 1 -> bits 16-31 after the shift below */ |
| salq $16, %rax |
| pmovmskb %xmm3, %r8d /* chunk 2 -> bits 32-47 after the shift below */ |
| pmovmskb %xmm4, %edx /* chunk 3 -> bits 48-63 after the shift below */ |
| salq $32, %r8 |
| orq %r8, %rax |
| orq %rcx, %rax |
| salq $48, %rdx |
| orq %rdx, %rax |
| .p2align 3 |
| L(return): /* rax = nonzero mask; bit i corresponds to the byte at rdi + i */ |
| bsfq %rax, %rax /* rax = offset of the first ch-or-NUL byte */ |
| #ifdef AS_STRCHRNUL |
| leaq (%rdi,%rax), %rax |
| #else |
| movl $0, %edx /* rdx = 0, the not-found result */ |
| leaq (%rdi,%rax), %rax |
| cmpb %sil, (%rax) /* did the scan stop on ch or on the terminating NUL? */ |
| cmovne %rdx, %rax /* the byte is not ch, so it is the terminator: return 0 */ |
| #endif |
| ret |
| .p2align 4 |
| |
| L(cross_page): /* str lies in the last 63 bytes of a 4096-byte block. Use aligned |
| loads from the 64-byte block that contains str, build the same 64-bit |
| ch-or-NUL mask, then shift out the bits of the bytes that precede str. */ |
| movq %rdi, %rdx |
| pxor %xmm2, %xmm2 /* xmm2 = 0, the NUL pattern */ |
| andq $-64, %rdx /* rdx = str rounded down to a 64-byte boundary */ |
| movdqa %xmm1, %xmm0 /* copy of the ch pattern, consumed by the compare of the last chunk */ |
| movdqa (%rdx), %xmm3 |
| movdqa %xmm3, %xmm4 |
| pcmpeqb %xmm1, %xmm3 |
| pcmpeqb %xmm2, %xmm4 |
| por %xmm4, %xmm3 |
| pmovmskb %xmm3, %r8d /* chunk 0 -> bits 0-15 */ |
| movdqa 16(%rdx), %xmm3 |
| movdqa %xmm3, %xmm4 |
| pcmpeqb %xmm1, %xmm3 |
| pcmpeqb %xmm2, %xmm4 |
| por %xmm4, %xmm3 |
| pmovmskb %xmm3, %eax /* chunk 1 -> bits 16-31 after the shift below */ |
| movdqa 32(%rdx), %xmm3 |
| movdqa %xmm3, %xmm4 |
| pcmpeqb %xmm1, %xmm3 |
| salq $16, %rax |
| pcmpeqb %xmm2, %xmm4 |
| por %xmm4, %xmm3 |
| pmovmskb %xmm3, %r9d /* chunk 2 -> bits 32-47 after the shift below */ |
| movdqa 48(%rdx), %xmm3 |
| pcmpeqb %xmm3, %xmm2 /* NUL test; this overwrites the zero held in xmm2 */ |
| salq $32, %r9 |
| pcmpeqb %xmm3, %xmm0 |
| orq %r9, %rax |
| orq %r8, %rax |
| por %xmm2, %xmm0 |
| pmovmskb %xmm0, %ecx /* chunk 3 -> bits 48-63 after the shift below */ |
| salq $48, %rcx |
| orq %rcx, %rax |
| movl %edi, %ecx |
| subb %dl, %cl /* cl = str - rdx, i.e. str & 63 */ |
| shrq %cl, %rax /* discard the bytes before str; bit 0 now corresponds to str */ |
| testq %rax, %rax |
| jne L(return) /* rdi is still str here, matching the shifted mask */ |
| jmp L(loop_start) /* nothing up to the end of this 64-byte block: continue with the aligned loop */ |
| |
| END (strchr) |
| |
| #ifndef AS_STRCHRNUL |
| weak_alias (strchr, index) /* Emitted only when AS_STRCHRNUL is not defined. |
| NOTE(review): presumably exports index as a weak alias of strchr; the |
| macro is defined outside this file -- confirm. */ |
| libc_hidden_builtin_def (strchr) /* NOTE(review): macro defined outside this file; exact effect not visible here */ |
| #endif |