| /* Optimized memcpy implementation for CELL BE PowerPC. |
| Copyright (C) 2010-2014 Free Software Foundation, Inc. |
| This file is part of the GNU C Library. |
| |
| The GNU C Library is free software; you can redistribute it and/or |
| modify it under the terms of the GNU Lesser General Public |
| License as published by the Free Software Foundation; either |
| version 2.1 of the License, or (at your option) any later version. |
| |
| The GNU C Library is distributed in the hope that it will be useful, |
| but WITHOUT ANY WARRANTY; without even the implied warranty of |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| Lesser General Public License for more details. |
| |
| You should have received a copy of the GNU Lesser General Public |
| License along with the GNU C Library; if not, see |
| <http://www.gnu.org/licenses/>. */ |
| |
| #include <sysdep.h> |
| |
| /* Tuning knobs for the cache-line copy loop in memcpy below. */ |
| #define PREFETCH_AHEAD 6 /* number of SRC cache lines to prefetch ahead of the copy */ |
| #define ZERO_AHEAD 4 /* number of DST cache lines to zero (dcbz) ahead of the copy */ |
| |
| /* memcpy routine optimized for CELL-BE-PPC v2.0 |
| * |
| * The CELL PPC core has 1 integer unit and 1 load/store unit |
| * CELL: |
| * 1st level data cache = 32K |
| * 2nd level data cache = 512K |
| * 3rd level data cache = 0K |
| * With 3.2 GHz clockrate the latency to 2nd level cache is >36 clocks, |
| * latency to memory is >400 clocks |
| * To improve copy performance we need to prefetch source data |
| * far ahead to hide this latency |
| * For best performance, instruction forms ending in "." like "andi." |
| * should be avoided as they are implemented in microcode on CELL. |
| * The below code is loop unrolled for the CELL cache line of 128 bytes |
| */ |
| |
| /* Start the routine on a 2**7 = 128-byte boundary, i.e. one CELL cache line. */ |
| .align 7 |
| |
| /* void *memcpy (void *dst, const void *src, size_t len) |
| |
| In: r3 = dst, r4 = src, r5 = len. |
| Out: r3 = dst; r3 is never written, all stores go through the copy in r6. |
| |
| Register roles: |
| r6 working DST pointer (biased by -8 while the lfdu/stfdu loops run) |
| r4 working SRC pointer (same -8 bias) |
| r5 bytes still to copy |
| r7 SRC-DST delta in the byte/half/word steps, otherwise a chunk count |
| r8 bytes needed to bring DST to a 16-byte boundary |
| r10 cache lines copied by .Lloop2 (at most PREFETCH_AHEAD when .Lloop runs) |
| r11 cache lines copied by .Lloop, then reused as the dcbz offset |
| r12 dcbt offset from r4 |
| Scratch: r0, fp9-fp12, cr0, cr1, cr5, cr6, cr7, ctr. |
| |
| All length comparisons are unsigned because len is a size_t. */ |
| EALIGN (memcpy, 5, 0) |
| CALL_MCOUNT |
| |
| dcbt 0,r4 /* Prefetch ONE SRC cacheline */ |
| cmplwi cr1,r5,16 /* is size < 16 ? */ |
| mr r6,r3 /* r6 = working DST, r3 stays intact as return value */ |
| blt+ cr1,.Lshortcopy |
| |
| .Lbigcopy: |
| neg r8,r3 /* LS 4 bits = # bytes to 16-byte dest bdry */ |
| clrlwi r8,r8,32-4 /* keep only those 4 bits: r8 = (-dst) & 15 */ |
| sub r7,r4,r3 /* r7 = SRC - DST, lets r6 index both buffers */ |
| cmplwi cr0,r8,0 |
| beq+ .Ldst_aligned |
| |
| .Ldst_unaligned: |
| /* Copy 1, 2, 4 and/or 8 bytes, one step per set bit of r8, */ |
| /* so that DST ends up 16-byte aligned. */ |
| mtcrf 0x01,r8 /* put #bytes to boundary into cr7 */ |
| subf r5,r8,r5 /* those bytes no longer count as remaining */ |
| |
| bf cr7*4+3,1f |
| lbzx r0,r7,r6 /* copy 1 byte */ |
| stb r0,0(r6) |
| addi r6,r6,1 |
| 1: bf cr7*4+2,2f |
| lhzx r0,r7,r6 /* copy 2 byte */ |
| sth r0,0(r6) |
| addi r6,r6,2 |
| 2: bf cr7*4+1,4f |
| lwzx r0,r7,r6 /* copy 4 byte */ |
| stw r0,0(r6) |
| addi r6,r6,4 |
| 4: bf cr7*4+0,8f |
| lfdx fp9,r7,r6 /* copy 8 byte */ |
| stfd fp9,0(r6) |
| addi r6,r6,8 |
| 8: |
| add r4,r7,r6 /* r4 = SRC advanced by the same amount as DST */ |
| |
| .Ldst_aligned: |
| |
| cmplwi cr5,r5,128-1 /* unsigned: a length with bit 31 set must not look negative */ |
| |
| neg r7,r6 |
| addi r6,r6,-8 /* prepare for stfdu */ |
| addi r4,r4,-8 /* prepare for lfdu */ |
| |
| clrlwi r7,r7,32-7 /* r7 = # bytes from DST to the next cacheline boundary */ |
| ble+ cr5,.Llessthancacheline |
| |
| cmplwi cr6,r7,0 /* cr6.eq: DST already cacheline aligned */ |
| subf r5,r7,r5 /* r5 = bytes left once DST is cacheline aligned */ |
| srwi r7,r7,4 /* # of 16-byte chunks up to that boundary */ |
| srwi r10,r5,7 /* number of cache lines to copy */ |
| |
| cmplwi r10,0 |
| li r11,0 /* number cachelines to copy with prefetch */ |
| beq .Lnocacheprefetch |
| |
| cmplwi r10,PREFETCH_AHEAD |
| li r12,128+8 /* prefetch distance */ |
| ble .Llessthanmaxprefetch |
| |
| /* More lines than the prefetch window: .Lloop handles r11 lines, */ |
| /* .Lloop2 handles the final PREFETCH_AHEAD lines. */ |
| subi r11,r10,PREFETCH_AHEAD |
| li r10,PREFETCH_AHEAD |
| |
| .Llessthanmaxprefetch: |
| mtctr r10 |
| |
| .LprefetchSRC: /* touch the first r10 SRC cache lines up front */ |
| dcbt r12,r4 |
| addi r12,r12,128 |
| bdnz .LprefetchSRC |
| |
| .Lnocacheprefetch: |
| mtctr r7 |
| cmplwi cr1,r5,128 /* cr1.lt: no whole cache line left after aligning */ |
| clrlwi r5,r5,32-7 /* r5 = tail bytes after the last whole cache line */ |
| beq cr6,.Lcachelinealigned |
| |
| .Laligntocacheline: /* 16 bytes per iteration up to the cacheline boundary */ |
| lfd fp9,0x08(r4) |
| lfdu fp10,0x10(r4) |
| stfd fp9,0x08(r6) |
| stfdu fp10,0x10(r6) |
| bdnz .Laligntocacheline |
| |
| |
| .Lcachelinealigned: /* copy while cache lines */ |
| |
| blt- cr1,.Llessthancacheline /* size <128 */ |
| |
| .Louterloop: |
| cmpwi r11,0 |
| mtctr r11 |
| beq- .Lendloop |
| |
| li r11,128*ZERO_AHEAD +8 /* DCBZ dist */ |
| |
| .align 4 |
| /* Copy whole cachelines, optimized by prefetching SRC cacheline */ |
| .Lloop: /* Copy aligned body */ |
| dcbt r12,r4 /* PREFETCH SOURCE some cache lines ahead */ |
| lfd fp9, 0x08(r4) |
| dcbz r11,r6 /* zero the DST line ZERO_AHEAD cache lines ahead */ |
| lfd fp10, 0x10(r4) /* 4 register stride copy is optimal */ |
| lfd fp11, 0x18(r4) /* to hide 1st level cache latency. */ |
| lfd fp12, 0x20(r4) |
| stfd fp9, 0x08(r6) |
| stfd fp10, 0x10(r6) |
| stfd fp11, 0x18(r6) |
| stfd fp12, 0x20(r6) |
| lfd fp9, 0x28(r4) |
| lfd fp10, 0x30(r4) |
| lfd fp11, 0x38(r4) |
| lfd fp12, 0x40(r4) |
| stfd fp9, 0x28(r6) |
| stfd fp10, 0x30(r6) |
| stfd fp11, 0x38(r6) |
| stfd fp12, 0x40(r6) |
| lfd fp9, 0x48(r4) |
| lfd fp10, 0x50(r4) |
| lfd fp11, 0x58(r4) |
| lfd fp12, 0x60(r4) |
| stfd fp9, 0x48(r6) |
| stfd fp10, 0x50(r6) |
| stfd fp11, 0x58(r6) |
| stfd fp12, 0x60(r6) |
| lfd fp9, 0x68(r4) |
| lfd fp10, 0x70(r4) |
| lfd fp11, 0x78(r4) |
| lfdu fp12, 0x80(r4) /* update form: r4 += 128 */ |
| stfd fp9, 0x68(r6) |
| stfd fp10, 0x70(r6) |
| stfd fp11, 0x78(r6) |
| stfdu fp12, 0x80(r6) /* update form: r6 += 128 */ |
| |
| bdnz .Lloop |
| |
| .Lendloop: |
| /* Remaining r10 whole cache lines, copied without dcbt/dcbz. */ |
| cmpwi r10,0 |
| slwi r10,r10,2 /* adjust from 128 to 32 byte stride */ |
| beq- .Lendloop2 |
| mtctr r10 |
| |
| .Lloop2: /* Copy aligned body */ |
| lfd fp9, 0x08(r4) |
| lfd fp10, 0x10(r4) |
| lfd fp11, 0x18(r4) |
| lfdu fp12, 0x20(r4) |
| stfd fp9, 0x08(r6) |
| stfd fp10, 0x10(r6) |
| stfd fp11, 0x18(r6) |
| stfdu fp12, 0x20(r6) |
| |
| bdnz .Lloop2 |
| .Lendloop2: |
| |
| .Llessthancacheline: /* less than one cache line left to do ? */ |
| cmplwi cr0,r5,16 |
| srwi r7,r5,4 /* divide size by 16 */ |
| blt- .Ldo_lt16 |
| mtctr r7 |
| |
| .Lcopy_remaining: /* 16 bytes per iteration */ |
| lfd fp9,0x08(r4) |
| lfdu fp10,0x10(r4) |
| stfd fp9,0x08(r6) |
| stfdu fp10,0x10(r6) |
| bdnz .Lcopy_remaining |
| |
| .Ldo_lt16: /* less than 16 ? */ |
| cmplwi cr0,r5,0 /* copy remaining bytes (0-15) */ |
| beqlr+ /* no rest to copy */ |
| addi r4,r4,8 /* undo the -8 bias used by lfdu */ |
| addi r6,r6,8 /* undo the -8 bias used by stfdu */ |
| |
| .Lshortcopy: /* SIMPLE COPY to handle size <= 15 bytes */ |
| mtcrf 0x01,r5 /* low 4 bits of the length select the 8/4/2/1 steps */ |
| sub r7,r4,r6 /* r7 = SRC - DST, lets r6 index both buffers */ |
| bf- cr7*4+0,8f |
| lfdx fp9,r7,r6 /* copy 8 byte */ |
| stfd fp9,0(r6) |
| addi r6,r6,8 |
| 8: |
| bf cr7*4+1,4f |
| lwzx r0,r7,r6 /* copy 4 byte */ |
| stw r0,0(r6) |
| addi r6,r6,4 |
| 4: |
| bf cr7*4+2,2f |
| lhzx r0,r7,r6 /* copy 2 byte */ |
| sth r0,0(r6) |
| addi r6,r6,2 |
| 2: |
| bf cr7*4+3,1f |
| lbzx r0,r7,r6 /* copy 1 byte */ |
| stb r0,0(r6) |
| 1: blr |
| |
| END (memcpy) |
| /* NOTE(review): macro is defined outside this file; presumably it emits the */ |
| /* hidden-visibility alias used for memcpy calls made from inside libc -- confirm. */ |
| libc_hidden_builtin_def (memcpy) |