// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "textflag.h"

// See memmove Go doc for important implementation constraints.

// Register map
//
// dstin  R0
// src    R1
// count  R2
// dst    R3 (same as R0, but gets modified in unaligned cases)
// srcend R4
// dstend R5
// data   R6-R17
// tmp1   R14

// Copies are split into 3 main cases: small copies of up to 32 bytes, medium
// copies of up to 128 bytes, and large copies. The overhead of the overlap
// check is negligible since it is only required for large copies.
//
// Large copies use a software pipelined loop processing 64 bytes per iteration.
// The destination pointer is 16-byte aligned to minimize unaligned accesses.
// The loop tail is handled by always copying 64 bytes from the end.

// func memmove(to, from unsafe.Pointer, n uintptr)
TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
	CBZ	R2, copy0

	// Small copies: 1..16 bytes
	CMP	$16, R2
	BLE	copy16

	// Large copies
	CMP	$128, R2
	BHI	copy_long
	CMP	$32, R2
	BHI	copy32_128

	// Small copies: 17..32 bytes.
	LDP	(R1), (R6, R7)
	ADD	R1, R2, R4          // R4 points just past the last source byte
	LDP	-16(R4), (R12, R13)
	STP	(R6, R7), (R0)
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	STP	(R12, R13), -16(R5)
	RET

// Small copies: 1..16 bytes.
copy16:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	CMP	$8, R2
	BLT	copy7
	MOVD	(R1), R6
	MOVD	-8(R4), R7
	MOVD	R6, (R0)
	MOVD	R7, -8(R5)
	RET

copy7:
	TBZ	$2, R2, copy3
	MOVWU	(R1), R6
	MOVWU	-4(R4), R7
	MOVW	R6, (R0)
	MOVW	R7, -4(R5)
	RET

copy3:
	TBZ	$1, R2, copy1
	MOVHU	(R1), R6
	MOVHU	-2(R4), R7
	MOVH	R6, (R0)
	MOVH	R7, -2(R5)
	RET

copy1:
	MOVBU	(R1), R6
	MOVB	R6, (R0)

copy0:
	RET

	// Medium copies: 33..128 bytes.
copy32_128:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	LDP	(R1), (R6, R7)
	LDP	16(R1), (R8, R9)
	LDP	-32(R4), (R10, R11)
	LDP	-16(R4), (R12, R13)
	CMP	$64, R2
	BHI	copy128
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy 65..128 bytes.
copy128:
	LDP	32(R1), (R14, R15)
	LDP	48(R1), (R16, R17)
	CMP	$96, R2
	BLS	copy96
	LDP	-64(R4), (R2, R3)
	LDP	-48(R4), (R1, R4)
	STP	(R2, R3), -64(R5)
	STP	(R1, R4), -48(R5)

copy96:
	STP	(R6, R7), (R0)
	STP	(R8, R9), 16(R0)
	STP	(R14, R15), 32(R0)
	STP	(R16, R17), 48(R0)
	STP	(R10, R11), -32(R5)
	STP	(R12, R13), -16(R5)
	RET

	// Copy more than 128 bytes.
copy_long:
	ADD	R1, R2, R4          // R4 points just past the last source byte
	ADD	R0, R2, R5          // R5 points just past the last destination byte
	MOVD	ZR, R7
	MOVD	ZR, R8

	CMP	$1024, R2
	BLT	backward_check
	// feature detect to decide how to align
	MOVBU	runtime·arm64UseAlignedLoads(SB), R6
	CBNZ	R6, use_aligned_loads
	MOVD	R0, R7
	MOVD	R5, R8
	B	backward_check
use_aligned_loads:
	MOVD	R1, R7
	MOVD	R4, R8
	// R7 and R8 are used here for the realignment calculation. In
	// the use_aligned_loads case, R7 is the src pointer and R8 is
	// the srcend pointer, which is used in the backward copy case.
	// When doing aligned stores, R7 is the dst pointer and R8 is
	// the dstend pointer.
backward_check:
	// Use backward copy if there is an overlap.
	SUB	R1, R0, R14
	CBZ	R14, copy0
	CMP	R2, R14
	BCC	copy_long_backward

	// Copy 16 bytes and then align src (R1) or dst (R0) to 16-byte alignment.
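	// R14 computed below is the realignment amount: the low 4 bits of the
	// pointer chosen above (src when aligned loads are preferred, dst
	// otherwise). For copies shorter than 1024 bytes R7 was left as ZR,
	// so R14 is 0 and no realignment happens. src (R1) and the working
	// copy of dst (R3) are both moved back by R14 and the count is grown
	// by the same amount, so accesses on the chosen side become 16-byte
	// aligned, while chunk A, taken from the original pointers, still
	// covers the original start. The pipelined loop then keeps four
	// 16-byte chunks (B..E) in flight per 64-byte iteration.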
	LDP	(R1), (R12, R13)     // Load A
	AND	$15, R7, R14         // Calculate the realignment offset
	SUB	R14, R1, R1
	SUB	R14, R0, R3          // move dst back same amount as src
	ADD	R14, R2, R2
	LDP	16(R1), (R6, R7)     // Load B
	STP	(R12, R13), (R0)     // Store A
	LDP	32(R1), (R8, R9)     // Load C
	LDP	48(R1), (R10, R11)   // Load D
	LDP.W	64(R1), (R12, R13)   // Load E
	// 80 bytes have been loaded; if less than 80+64 bytes remain, copy from the end
	SUBS	$144, R2, R2
	BLS	copy64_from_end

loop64:
	STP	(R6, R7), 16(R3)     // Store B
	LDP	16(R1), (R6, R7)     // Load B (next iteration)
	STP	(R8, R9), 32(R3)     // Store C
	LDP	32(R1), (R8, R9)     // Load C
	STP	(R10, R11), 48(R3)   // Store D
	LDP	48(R1), (R10, R11)   // Load D
	STP.W	(R12, R13), 64(R3)   // Store E
	LDP.W	64(R1), (R12, R13)   // Load E
	SUBS	$64, R2, R2
	BHI	loop64

	// Write the last iteration and copy 64 bytes from the end.
copy64_from_end:
	LDP	-64(R4), (R14, R15)  // Load F
	STP	(R6, R7), 16(R3)     // Store B
	LDP	-48(R4), (R6, R7)    // Load G
	STP	(R8, R9), 32(R3)     // Store C
	LDP	-32(R4), (R8, R9)    // Load H
	STP	(R10, R11), 48(R3)   // Store D
	LDP	-16(R4), (R10, R11)  // Load I
	STP	(R12, R13), 64(R3)   // Store E
	STP	(R14, R15), -64(R5)  // Store F
	STP	(R6, R7), -48(R5)    // Store G
	STP	(R8, R9), -32(R5)    // Store H
	STP	(R10, R11), -16(R5)  // Store I
	RET

	// Large backward copy for overlapping copies.
	// Copy 16 bytes and then align srcend (R4) or dstend (R5) to 16-byte alignment.
copy_long_backward:
	LDP	-16(R4), (R12, R13)
	AND	$15, R8, R14
	SUB	R14, R4, R4
	SUB	R14, R2, R2
	LDP	-16(R4), (R6, R7)
	STP	(R12, R13), -16(R5)
	LDP	-32(R4), (R8, R9)
	LDP	-48(R4), (R10, R11)
	LDP.W	-64(R4), (R12, R13)
	SUB	R14, R5, R5
	SUBS	$128, R2, R2
	BLS	copy64_from_start

loop64_backward:
	STP	(R6, R7), -16(R5)
	LDP	-16(R4), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	-32(R4), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	-48(R4), (R10, R11)
	STP.W	(R12, R13), -64(R5)
	LDP.W	-64(R4), (R12, R13)
	SUBS	$64, R2, R2
	BHI	loop64_backward

	// Write the last iteration and copy 64 bytes from the start.
copy64_from_start:
	LDP	48(R1), (R2, R3)
	STP	(R6, R7), -16(R5)
	LDP	32(R1), (R6, R7)
	STP	(R8, R9), -32(R5)
	LDP	16(R1), (R8, R9)
	STP	(R10, R11), -48(R5)
	LDP	(R1), (R10, R11)
	STP	(R12, R13), -64(R5)
	STP	(R2, R3), 48(R0)
	STP	(R6, R7), 32(R0)
	STP	(R8, R9), 16(R0)
	STP	(R10, R11), (R0)
	RET
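
// Note on the backward path: R0 and R1 still hold the original start
// pointers there (only srcend, dstend and the count are adjusted), so
// copy64_from_start can unconditionally re-copy the first 64 bytes,
// mirroring how copy64_from_end always re-copies the last 64 bytes on
// the forward path. R2 (the count, no longer needed at that point) and
// R3 (only used as a destination pointer on the forward path) are free
// to serve as data registers in that final block.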