| /* Fast SSE2 memrchr using a 64-byte loop and the pmaxub instruction. |
| |
| Copyright (C) 2011-2014 Free Software Foundation, Inc. |
| Contributed by Intel Corporation. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include <sysdep.h> |
| |
| .text |
| /* void *memrchr (const void *s, int c, size_t n) |
| |
| Return a pointer to the last byte equal to the low byte of C in the |
| N bytes starting at S, or NULL if there is none. |
| |
| In: %rdi = s, %rsi = c, %rdx = n. |
| Out: %rax = address of the last match, or 0. |
| Uses: %rcx, %rdx, %rdi, %r8, %xmm0-%xmm4, flags. No stack. |
| |
| The buffer is scanned from its end towards its start. After the |
| pattern is broadcast, %xmm1 holds the search byte in all 16 lanes. */ |
| ENTRY (memrchr) |
| movd %rsi, %xmm1 |
| |
| #ifdef __ILP32__ |
| /* With a 32-bit size_t (x32) the upper half of %rdx is not |
| guaranteed to be zero; clear it before the full register is used |
| as a length and as a pointer offset. */ |
| mov %edx, %edx |
| #endif |
| |
| sub $16, %rdx /* %rdx = n - 16 */ |
| jbe L(length_less16) /* n <= 16: short-buffer path */ |
| |
| /* Broadcast the low byte of c to all 16 bytes of %xmm1 (the two |
| unpacks here, the pshufd below). */ |
| punpcklbw %xmm1, %xmm1 |
| punpcklbw %xmm1, %xmm1 |
| |
| add %rdx, %rdi /* %rdi = s + n - 16: last 16 bytes */ |
| pshufd $0, %xmm1, %xmm1 |
| |
| /* The last 16 bytes are checked with an unaligned load. */ |
| movdqu (%rdi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| |
| /* Check if there is a match. */ |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(matches0) |
| |
| /* Step down to the next 64-byte window [%rdi, %rdi + 64). At this |
| point %rdx = n - 16 = (%rdi + 64) - s. */ |
| sub $64, %rdi |
| mov %rdi, %rcx |
| and $15, %rcx |
| jz L(loop_prolog) |
| |
| /* Not 16-byte aligned: round %rdi up to the next 16-byte boundary |
| and grow %rdx by the same amount (16 - %rcx), which keeps |
| %rdx = (%rdi + 64) - s. The window then overlaps some of the bytes |
| already checked above. */ |
| add $16, %rdi |
| add $16, %rdx |
| and $-16, %rdi |
| sub %rcx, %rdx |
| /* First two 64-byte windows, scanned in 16-byte blocks from the |
| highest address down. On entry %rdi is 16-byte aligned and |
| %rdx = (%rdi + 64) - s, where s is the start of the buffer. */ |
| .p2align 4 |
| L(loop_prolog): |
| sub $64, %rdx /* %rdx = %rdi - s */ |
| jbe L(exit_loop) /* s lies inside this window */ |
| /* The whole window is inside the buffer: no range checks needed. */ |
| movdqa 48(%rdi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(matches48) |
| |
| movdqa 32(%rdi), %xmm2 |
| pcmpeqb %xmm1, %xmm2 |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(matches32) |
| |
| movdqa 16(%rdi), %xmm3 |
| pcmpeqb %xmm1, %xmm3 |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(matches16) |
| |
| movdqa (%rdi), %xmm4 |
| pcmpeqb %xmm1, %xmm4 |
| pmovmskb %xmm4, %eax |
| test %eax, %eax |
| jnz L(matches0) |
| /* Second window, 64 bytes lower; same scan. */ |
| sub $64, %rdi |
| sub $64, %rdx |
| jbe L(exit_loop) |
| |
| movdqa 48(%rdi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(matches48) |
| |
| movdqa 32(%rdi), %xmm2 |
| pcmpeqb %xmm1, %xmm2 |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(matches32) |
| |
| movdqa 16(%rdi), %xmm3 |
| pcmpeqb %xmm1, %xmm3 |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(matches16) |
| |
| movdqa (%rdi), %xmm3 |
| pcmpeqb %xmm1, %xmm3 |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(matches0) |
| /* Align %rdi to 64 bytes for the main loop. If it is not aligned, |
| round it up and grow %rdx by the same amount so that %rdx stays |
| equal to %rdi - s; the loop then rescans some bytes that were |
| already checked above. */ |
| mov %rdi, %rcx |
| and $63, %rcx |
| jz L(align64_loop) |
| |
| add $64, %rdi |
| add $64, %rdx |
| and $-64, %rdi |
| sub %rcx, %rdx |
| /* Main loop: one 64-byte-aligned window per iteration, moving down. |
| Each pass steps %rdi down by 64 and keeps %rdx = %rdi - s, where s |
| is the start of the buffer. */ |
| .p2align 4 |
| L(align64_loop): |
| sub $64, %rdi |
| sub $64, %rdx |
| jbe L(exit_loop) /* s lies inside this window */ |
| /* Load and compare all four 16-byte blocks of the window. */ |
| movdqa (%rdi), %xmm0 |
| movdqa 16(%rdi), %xmm2 |
| movdqa 32(%rdi), %xmm3 |
| movdqa 48(%rdi), %xmm4 |
| |
| pcmpeqb %xmm1, %xmm0 |
| pcmpeqb %xmm1, %xmm2 |
| pcmpeqb %xmm1, %xmm3 |
| pcmpeqb %xmm1, %xmm4 |
| /* pcmpeqb leaves 0x00 or 0xff in every byte, so the unsigned byte |
| maximum combines the four results like a bitwise OR. */ |
| pmaxub %xmm3, %xmm0 |
| pmaxub %xmm4, %xmm2 |
| pmaxub %xmm0, %xmm2 |
| pmovmskb %xmm2, %eax |
| |
| test %eax, %eax |
| jz L(align64_loop) /* no match anywhere in the window */ |
| /* A match exists; find the highest block holding one. %xmm4 and |
| %xmm3 still hold the compare results for blocks 48 and 32. */ |
| pmovmskb %xmm4, %eax |
| test %eax, %eax |
| jnz L(matches48) |
| |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(matches32) |
| /* pmaxub overwrote %xmm0 and %xmm2, so blocks 16 and 0 are compared |
| again. This also overwrites the pattern in %xmm1, which is no |
| longer needed because every path from here returns. */ |
| movdqa 16(%rdi), %xmm2 |
| |
| pcmpeqb %xmm1, %xmm2 |
| pcmpeqb (%rdi), %xmm1 |
| |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(matches16) |
| /* Blocks 48, 32 and 16 had no match, so block 0 must hold it. */ |
| pmovmskb %xmm1, %eax |
| bsr %eax, %eax |
| |
| add %rdi, %rax |
| ret |
| /* Last window: the start of the buffer lies inside [%rdi, %rdi + 64). |
| After the add below, %rdx (1..64) is the number of buffer bytes in |
| the window; they occupy its highest %rdx bytes. */ |
| .p2align 4 |
| L(exit_loop): |
| add $64, %rdx |
| cmp $32, %rdx |
| jbe L(exit_loop_32) |
| /* More than 32 bytes left: blocks 48 and 32 are wholly in range. */ |
| movdqa 48(%rdi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(matches48) |
| |
| movdqa 32(%rdi), %xmm2 |
| pcmpeqb %xmm1, %xmm2 |
| pmovmskb %xmm2, %eax |
| test %eax, %eax |
| jnz L(matches32) |
| /* Block 16 may extend below the start of the buffer; |
| L(matches16_1) rejects a hit that falls outside it. */ |
| movdqa 16(%rdi), %xmm3 |
| pcmpeqb %xmm1, %xmm3 |
| pmovmskb %xmm3, %eax |
| test %eax, %eax |
| jnz L(matches16_1) |
| cmp $48, %rdx /* any buffer bytes in block 0? */ |
| jbe L(return_null) |
| /* Block 0; %xmm1 is overwritten, the pattern is not needed again. */ |
| pcmpeqb (%rdi), %xmm1 |
| pmovmskb %xmm1, %eax |
| test %eax, %eax |
| jnz L(matches0_1) |
| xor %eax, %eax /* no match: return NULL */ |
| ret |
| /* Last window with at most 32 buffer bytes in it. %rdx is that |
| count; the bytes sit at the top of [%rdi, %rdi + 64), so only |
| blocks 48 and 32 can contain any of them. */ |
| .p2align 4 |
| L(exit_loop_32): |
| movdqa 48(%rdi), %xmm0 |
| pcmpeqb %xmm1, %xmm0 |
| pmovmskb %xmm0, %eax |
| test %eax, %eax |
| jnz L(matches48_1) /* range-checked against %rdx there */ |
| cmp $16, %rdx /* any buffer bytes in block 32? */ |
| jbe L(return_null) |
| /* Block 32; %xmm1 is overwritten, the pattern is not needed again. */ |
| pcmpeqb 32(%rdi), %xmm1 |
| pmovmskb %xmm1, %eax |
| test %eax, %eax |
| jnz L(matches32_1) |
| xor %eax, %eax /* no match: return NULL */ |
| ret |
| /* L(matchesN): a match was found in the 16-byte block at %rdi + N, |
| and the block lies wholly inside the range being searched. %eax |
| is the non-zero pmovmskb mask of the block; bsr yields its highest |
| set bit, i.e. the index of the last matching byte. Returns |
| %rdi + N + index in %rax. */ |
| .p2align 4 |
| L(matches0): |
| bsr %eax, %eax |
| add %rdi, %rax |
| ret |
| |
| .p2align 4 |
| L(matches16): |
| bsr %eax, %eax |
| lea 16(%rax, %rdi), %rax |
| ret |
| |
| .p2align 4 |
| L(matches32): |
| bsr %eax, %eax |
| lea 32(%rax, %rdi), %rax |
| ret |
| |
| .p2align 4 |
| L(matches48): |
| bsr %eax, %eax |
| lea 48(%rax, %rdi), %rax |
| ret |
| /* L(matchesN_1): a match was found in the block at %rdi + N of the |
| last window, which may extend below the start of the buffer. On |
| entry %rdx is the number of buffer bytes in the window (they occupy |
| its top) and %eax is the non-zero mask of the block. With i the |
| index of the highest match, that byte is inside the buffer iff |
| %rdx - (64 - N) + i >= 0. If it is not, every match in the block |
| is below the buffer and NULL is returned. */ |
| .p2align 4 |
| L(matches0_1): |
| bsr %eax, %eax |
| sub $64, %rdx |
| add %rax, %rdx |
| jl L(return_null) /* signed: the sum may be negative */ |
| add %rdi, %rax |
| ret |
| |
| .p2align 4 |
| L(matches16_1): |
| bsr %eax, %eax |
| sub $48, %rdx |
| add %rax, %rdx |
| jl L(return_null) |
| lea 16(%rdi, %rax), %rax |
| ret |
| |
| .p2align 4 |
| L(matches32_1): |
| bsr %eax, %eax |
| sub $32, %rdx |
| add %rax, %rdx |
| jl L(return_null) |
| lea 32(%rdi, %rax), %rax |
| ret |
| |
| .p2align 4 |
| L(matches48_1): |
| bsr %eax, %eax |
| sub $16, %rdx |
| add %rax, %rdx |
| jl L(return_null) |
| lea 48(%rdi, %rax), %rax |
| ret |
| |
| .p2align 4 |
| L(return_null): /* shared "no match" exit: return NULL */ |
| xor %eax, %eax /* a 32-bit write clears all of %rax */ |
| ret |
| /* Short buffer (n <= 16) whose start is 16-byte aligned. On entry |
| %rdi = start of the buffer, %rdx = n and %xmm1 = the search byte |
| in all 16 lanes. The buffer is the low n bytes of the block at |
| %rdi. */ |
| .p2align 4 |
| L(length_less16_offset0): |
| test %edx, %edx /* n == 0: nothing to search */ |
| jz L(return_null) |
| |
| mov %dl, %cl |
| pcmpeqb (%rdi), %xmm1 |
| /* %edx = (1 << n) - 1: one bit for each byte of the buffer. */ |
| mov $1, %edx |
| sal %cl, %edx |
| sub $1, %edx |
| |
| pmovmskb %xmm1, %eax |
| |
| and %edx, %eax /* drop matches past the end of the buffer */ |
| test %eax, %eax |
| jz L(return_null) |
| |
| bsr %eax, %eax /* index of the last matching byte */ |
| add %rdi, %rax |
| ret |
| |
| /* Short buffers: n <= 16. Entered from the top of memrchr with |
| %rdi = s (start of the buffer), %rdx = n - 16 and c in the low |
| byte of %xmm1. */ |
| .p2align 4 |
| L(length_less16): |
| punpcklbw %xmm1, %xmm1 |
| punpcklbw %xmm1, %xmm1 |
| |
| add $16, %rdx /* %rdx = n again */ |
| |
| pshufd $0, %xmm1, %xmm1 /* %xmm1 = c in all 16 bytes */ |
| |
| mov %rdi, %rcx |
| and $15, %rcx /* %rcx = s & 15 */ |
| jz L(length_less16_offset0) |
| |
| /* %rcx still holds s & 15 here, so it is not computed again. */ |
| mov %cl, %dh |
| mov %rcx, %r8 /* %r8 = s & 15, kept for the result */ |
| add %dl, %dh /* %dh = (s & 15) + n, at most 31 */ |
| and $-16, %rdi /* %rdi = aligned block containing s */ |
| |
| sub $16, %dh |
| ja L(length_less16_part2) /* buffer reaches into the next block */ |
| |
| /* The whole buffer lies inside the single block at %rdi. */ |
| pcmpeqb (%rdi), %xmm1 |
| pmovmskb %xmm1, %eax |
| |
| sar %cl, %eax /* bit i now stands for byte s + i */ |
| mov %dl, %cl |
| |
| /* %edx = (1 << n) - 1: one bit for each byte of the buffer (0 when |
| n is 0, which makes the result NULL). */ |
| mov $1, %edx |
| sal %cl, %edx |
| sub $1, %edx |
| |
| and %edx, %eax /* drop matches past the end of the buffer */ |
| test %eax, %eax |
| jz L(return_null) |
| |
| bsr %eax, %eax /* index of the last matching byte */ |
| add %rdi, %rax |
| add %r8, %rax /* %rax = s + index */ |
| ret |
| /* Short buffer (n <= 16) that crosses a 16-byte boundary. On entry |
| %rdi = start of the buffer rounded down to 16, %r8 = offset of the |
| buffer start within that block, %dh = number of buffer bytes in |
| the upper block at 16(%rdi), %xmm1 = the search byte in all 16 |
| lanes. The upper block is searched first. */ |
| .p2align 4 |
| L(length_less16_part2): |
| movdqa 16(%rdi), %xmm2 |
| pcmpeqb %xmm1, %xmm2 |
| pmovmskb %xmm2, %eax |
| /* %edx = (1 << %dh) - 1: the buffer bytes of the upper block. */ |
| mov %dh, %cl |
| mov $1, %edx |
| sal %cl, %edx |
| sub $1, %edx |
| |
| and %edx, %eax |
| |
| test %eax, %eax |
| jnz L(length_less16_part2_return) |
| /* No match in the upper block: search the lower one. */ |
| pcmpeqb (%rdi), %xmm1 |
| pmovmskb %xmm1, %eax |
| /* Shift out the bytes that lie below the start of the buffer. */ |
| mov %r8, %rcx |
| sar %cl, %eax |
| test %eax, %eax |
| jz L(return_null) |
| |
| bsr %eax, %eax |
| add %rdi, %rax |
| add %r8, %rax /* %rax = buffer start + index */ |
| ret |
| |
| .p2align 4 |
| L(length_less16_part2_return): |
| bsr %eax, %eax /* last match within the upper block */ |
| lea 16(%rax, %rdi), %rax |
| ret |
| /* End of memrchr. strong_alias is not defined in this file; |
| presumably it makes __memrchr a second name for memrchr. */ |
| END (memrchr) |
| strong_alias (memrchr, __memrchr) |