Text file src/internal/bytealg/equal_ppc64x.s

// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// 4K (smallest case) page size offset mask for PPC64.
#define PAGE_OFFSET 4095

// TODO: As of this writing, ISEL and BC do not support CR bit type
// arguments; define them here for readability.
#define CR0LT 4*0+0
#define CR0EQ 4*0+2
#define CR1LT 4*1+0
#define CR6LT 4*6+0
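// The BI operand built this way is 4*(CR field) + bit, where bits 0-3 of
// each field are LT, GT, EQ, SO; e.g. CR6LT selects bit 24 of the CR.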

// Likewise, the BC opcode is hard to read, and no extended
// mnemonics are offered for these forms.
#define BGELR_CR6 BC  4, CR6LT, (LR)
#define BEQLR     BC 12, CR0EQ, (LR)
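// BO=4 means "branch if the CR bit is clear" and BO=12 "branch if set", so
// BGELR_CR6 returns when VCMPEQUBCC's all-equal bit (CR6LT) is clear (a
// mismatch was found), and BEQLR returns when CR0 EQ is set.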

// memequal(a, b unsafe.Pointer, size uintptr) bool
TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
	// R3 = a
	// R4 = b
	// R5 = size
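	// Under ABIInternal the arguments arrive in R3-R5 and the bool result
	// is returned in R3, matching memeqbody's register contract, so it can
	// simply be tail-called.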
	BR	memeqbody<>(SB)

// memequal_varlen(a, b unsafe.Pointer) bool
TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-17
	// R3 = a
	// R4 = b
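	// R11 = closure context pointer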
	CMP	R3, R4
	BEQ	eq
	MOVD	8(R11), R5    // compiler stores size at offset 8 in the closure
	BR	memeqbody<>(SB)
eq:
	MOVD	$1, R3
	RET

// Do an efficient memequal for ppc64
// R3 = s1
// R4 = s2
// R5 = len
// On exit:
// R3 = return value
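//
// Strategy: lengths 0-16 are compared with one or two GPR loads, 17-32 with
// a pair of overlapping 16-byte vector compares, 33-64 with a hybrid of the
// two previous schemes, and larger sizes with a 64-byte-per-iteration vector
// loop followed by an overlapping compare of the final 64 bytes. The
// overlapping trick relies on: for 17 <= n <= 32, a[:n] == b[:n] iff
// a[0:16] == b[0:16] && a[n-16:n] == b[n-16:n].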
TEXT memeqbody<>(SB),NOSPLIT|NOFRAME,$0-0
	MOVD	R3, R8		// Move s1 into R8
	ADD	R5, R3, R9	// &s1[len(s1)]
	ADD	R5, R4, R10	// &s2[len(s2)]
	MOVD	$1, R11
	CMP	R5, $16		// Use GPR checks for len <= 16
	BLE	check0_16
	MOVD	$0, R3		// Assume no-match in case BGELR CR6 returns
	CMP	R5, $32		// Use overlapping VSX loads for len <= 32
	BLE	check17_32	// Do a pair of overlapping VSR compares
	CMP	R5, $64
	BLE	check33_64	// Hybrid check + overlap compare.

setup64:
	SRD	$6, R5, R6	// number of 64 byte chunks to compare
	MOVD	R6, CTR
	MOVD	$16, R14	// indexes for VSX loads
	MOVD	$32, R15
	MOVD	$48, R16
	ANDCC	$0x3F, R5, R5	// len%64==0?
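	// ANDCC records len%64 in CR0; nothing in loop64 modifies CR0, so the
	// EQ bit is still valid for the ISEL/BEQLR tail check after the loop.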

	PCALIGN $32
loop64:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC V0, V1, V2	// compare, setting CR6
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$64, R8		// bump up to next 64
	ADD	$64, R4
	BDNZ	loop64

	ISEL	$CR0EQ, R11, R3, R3	// If no tail, return 1, otherwise R3 remains 0.
	BEQLR				// return if no tail.

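	// A tail of 1-63 bytes remains. Point R8/R4 at the last 64 bytes of
	// each operand and compare those; bytes already verified by the loop
	// are harmlessly compared again.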
	ADD	$-64, R9, R8
	ADD	$-64, R10, R4
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R14), V0
	LXVD2X	(R4+R14), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R15), V0
	LXVD2X	(R4+R15), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	LXVD2X	(R8+R16), V0
	LXVD2X	(R4+R16), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R11, R0, R3
	RET

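// Sizes 33-64: check bytes 0-31 with two vector compares, then fall into
// check17_32 with R8/R4 pointed at &sX[len-32] so the final 32 bytes are
// covered (overlapping bytes already checked). E.g. for len=40 the compared
// ranges are [0,16), [16,32), [8,24) and [24,40).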
check33_64:
	// Bytes 0-15
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6
	ADD	$16, R8
	ADD	$16, R4

	// Bytes 16-31
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	BGELR_CR6

	// A little tricky, but point R4, R8 to &sx[len-32],
	// and reuse check17_32 to check the next 1-32 bytes (with some overlap)
	ADD	$-32, R9, R8
	ADD	$-32, R10, R4
	// Fallthrough

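// Sizes 17-32 (also the tail compare for check33_64): compare 16 bytes at
// R8/R4 and the final 16 bytes ending at &sX[len]. For 17 <= len <= 32 the
// two loads overlap by 32-len bytes and together cover every byte, e.g.
// len=20 compares [0,16) and [4,20).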
check17_32:
	LXVD2X	(R8+R0), V0
	LXVD2X	(R4+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R11, R0, R5

	// Load sX[len(sX)-16:len(sX)] and compare.
	ADD	$-16, R9
	ADD	$-16, R10
	LXVD2X	(R9+R0), V0
	LXVD2X	(R10+R0), V1
	VCMPEQUBCC	V0, V1, V2
	ISEL	$CR6LT, R5, R0, R3
	RET

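// Sizes 0-16: len < 8 goes to check0_7; otherwise compare the first and last
// 8 bytes (overlapping), e.g. len=11 compares [0,8) and [3,11).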
check0_16:
	CMP	R5, $8
	BLT	check0_7
	// Load sX[0:8] and compare.
	MOVD	(R8), R6
	MOVD	(R4), R7
	CMP	R6, R7
	ISEL	$CR0EQ, R11, R0, R5
	// Load sX[len(sX)-8:len(sX)] and compare.
	MOVD	-8(R9), R6
	MOVD	-8(R10), R7
	CMP	R6, R7
	ISEL	$CR0EQ, R5, R0, R3
	RET

check0_7:
	CMP	R5, $0
	MOVD	$1, R3
	BEQLR		// return if len == 0

	// Check 1-7 byte lengths with a single 8-byte load and compare,
	// choosing the load address so the load cannot cross a page boundary:
	// load a few bytes from below &sX if that stays within the same page,
	// otherwise load at &sX and read a few extra bytes past the end (which
	// then stays within the page). Then shift the loaded values so both
	// registers hold the same len bytes, since the two operands may have
	// chosen different load addresses.
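	// For example, with len=3 on little-endian: if &sX has at least 5
	// bytes below it in the same page, load 8 bytes at &sX-5 and shift
	// right by 40 so the wanted bytes land in the low 3 bytes; otherwise
	// load at &sX and clear the high 5 bytes with the shift-left/shift-right
	// pair. Either way both operands become zero-extended 3-byte values
	// comparable with a single CMP.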
	ANDCC	$PAGE_OFFSET, R8, R6	// &sX & PAGE_OFFSET
	ANDCC	$PAGE_OFFSET, R4, R9
	SUBC	R5, $8, R12		// 8-len
	SLD	$3, R12, R14		// (8-len)*8
	CMPU	R6, R12, CR1		// Enough bytes lower in the page to load lower?
	CMPU	R9, R12, CR0
	SUB	R12, R8, R6		// compute lower load address
	SUB	R12, R4, R9
	ISEL	$CR1LT, R8, R6, R8	// R8 = (&s1 & PAGE_OFFSET) < (8-len) ? &s1 : &s1 - (8-len)
	ISEL	$CR0LT, R4, R9, R4	// Similar for s2
	MOVD	(R8), R15
	MOVD	(R4), R16
	SLD	R14, R15, R7
	SLD	R14, R16, R17
	SRD	R14, R7, R7		// Clear the upper (8-len) bytes (with 2 shifts)
	SRD	R14, R17, R17
	SRD	R14, R15, R6		// Clear the lower (8-len) bytes
	SRD	R14, R16, R9
#ifdef GOARCH_ppc64le
	ISEL	$CR1LT, R7, R6, R8      // Choose the correct len bytes to compare based on alignment
	ISEL	$CR0LT, R17, R9, R4
#else
	ISEL	$CR1LT, R6, R7, R8
	ISEL	$CR0LT, R9, R17, R4
#endif
	CMP	R4, R8
	ISEL	$CR0EQ, R11, R0, R3
	RET