// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.
backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
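	// R14 computed below is the realignment amount: the low 4 bits of the
	// pointer chosen above (src when aligned loads are preferred, dst
	// otherwise). For copies shorter than 1024 bytes R7 was left as ZR,
	// so R14 is 0 and no realignment happens. src (R1) and the working
	// copy of dst (R3) are both moved back by R14 and the count is grown
	// by the same amount, so accesses on the chosen side become 16-byte
	// aligned, while chunk A, taken from the original pointers, still
	// covers the original start. The pipelined loop then keeps four
	// 16-byte chunks (B..E) in flight per 64-byte iteration.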
	LDP	(R1), (R12, R13)     // Load A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load C
	LDP	48(R1), (R10, R11)   // Load D
	LDP.W	64(R1), (R12, R13)   // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)     // Store B
	LDP	16(R1), (R6, R7)     // Load B (next iteration)
	STP	(R8, R9), 32(R3)     // Store C
	LDP	32(R1), (R8, R9)     // Load C
	STP	(R10, R11), 48(R3)   // Store D
	LDP	48(R1), (R10, R11)   // Load D
	STP.W	(R12, R13), 64(R3)   // Store E
	LDP.W	64(R1), (R12, R13)   // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
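
// Note on the backward path: R0 and R1 still hold the original start
// pointers there (only srcend, dstend and the count are adjusted), so
// copy64_from_start can unconditionally re-copy the first 64 bytes,
// mirroring how copy64_from_end always re-copies the last 64 bytes on
// the forward path. R2 (the count, no longer needed at that point) and
// R3 (only used as a destination pointer on the forward path) are free
// to serve as data registers in that final block.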