indexbyte_ppc64x.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ppc64 || ppc64le
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    11  	// R3 = byte array pointer
    12  	// R4 = length
    13  	MOVD	R6, R5		// R5 = byte
    14  	BR	indexbytebody<>(SB)
    15  
    16  TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
    17  	// R3 = string
    18  	// R4 = length
    19  	// R5 = byte
    20  	BR	indexbytebody<>(SB)
    21  
    22  #ifndef GOPPC64_power9
    23  #ifdef GOARCH_ppc64le
    24  DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
    25  DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
    26  #else
    27  DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
    28  DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
    29  #endif
    30  GLOBL indexbytevbperm<>+0(SB), RODATA, $16
    31  #endif
    32  
    33  // Some operations are endian specific, choose the correct opcode base on GOARCH.
    34  // Note, _VCZBEBB is only available on power9 and newer.
    35  #ifdef GOARCH_ppc64le
    36  #define _LDBEX	MOVDBR
    37  #define _LWBEX	MOVWBR
    38  #define _LHBEX	MOVHBR
    39  #define _VCZBEBB VCTZLSBB
    40  #else
    41  #define _LDBEX	MOVD
    42  #define _LWBEX	MOVW
    43  #define _LHBEX	MOVH
    44  #define _VCZBEBB VCLZLSBB
    45  #endif
    46  
    47  // R3 = addr of string
    48  // R4 = len of string
    49  // R5 = byte to find
    50  // On exit:
    51  // R3 = return value
    52  TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
    53  	CMPU	R4,$32
    54  
    55  #ifndef GOPPC64_power9
    56  	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
    57  	MOVD	$indexbytevbperm<>+00(SB),R16
    58  	LXVD2X	(R16),V0	// Set up swap string
    59  #endif
    60  
    61  	MTVRD	R5,V1
    62  	VSPLTB	$7,V1,V1	// Replicate byte across V1
    63  
    64  	BLT	cmp16		// Jump to the small string case if it's <32 bytes.
    65  
    66  	CMP	R4,$64,CR1
    67  	MOVD	$16,R11
    68  	MOVD	R3,R8
    69  	BLT	CR1,cmp32	// Special case for length 32 - 63
    70  	MOVD	$32,R12
    71  	MOVD	$48,R6
    72  
    73  	RLDICR  $0,R4,$63-6,R9	// R9 = len &^ 63
    74  	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
    75  	ANDCC	$63,R4		// (len &= 63) cmp 0.
    76  
    77  	PCALIGN	$16
    78  loop64:
    79  	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
    80  	VCMPEQUBCC	V2,V1,V6
    81  	BNE	CR6,foundat0	// Match found at R8, jump out
    82  
    83  	LXVD2X	(R11)(R8),V2
    84  	VCMPEQUBCC	V2,V1,V6
    85  	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out
    86  
    87  	LXVD2X	(R12)(R8),V2
    88  	VCMPEQUBCC	V2,V1,V6
    89  	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out
    90  
    91  	LXVD2X	(R6)(R8),V2
    92  	VCMPEQUBCC	V2,V1,V6
    93  	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out
    94  
    95  	ADD	$64,R8
    96  	CMPU	R8,R9,CR1
    97  	BNE	CR1,loop64	// R8 != &s[len &^ 63]?
    98  
    99  	PCALIGN	$32
   100  	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.
   101  
   102  	CMP	R4,$32		// Tail length >= 32, use cmp32 path.
   103  	CMP	R4,$16,CR1
   104  	BGE	cmp32
   105  
   106  	ADD	R8,R4,R9
   107  	ADD	$-16,R9
   108  	BLE	CR1,cmp64_tail_gt0
   109  
   110  cmp64_tail_gt16:	// Tail length 17 - 32
   111  	LXVD2X	(R0)(R8),V2
   112  	VCMPEQUBCC	V2,V1,V6
   113  	BNE	CR6,foundat0
   114  
   115  cmp64_tail_gt0:	// Tail length 1 - 16
   116  	MOVD	R9,R8
   117  	LXVD2X	(R0)(R9),V2
   118  	VCMPEQUBCC	V2,V1,V6
   119  	BNE	CR6,foundat0
   120  
   121  	BR	notfound
   122  
   123  cmp32:	// Length 32 - 63
   124  
   125  	// Bytes 0 - 15
   126  	LXVD2X	(R0)(R8),V2
   127  	VCMPEQUBCC	V2,V1,V6
   128  	BNE	CR6,foundat0
   129  
   130  	// Bytes 16 - 31
   131  	LXVD2X	(R8)(R11),V2
   132  	VCMPEQUBCC	V2,V1,V6
   133  	BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out
   134  
   135  	BEQ	notfound		// Is length <= 32? (CR0 holds this comparison on entry to cmp32)
   136  	CMP	R4,$48
   137  
   138  	ADD	R4,R8,R9		// Compute &s[len(s)-16]
   139  	ADD	$32,R8,R8
   140  	ADD	$-16,R9,R9
   141  	ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8
   142  
   143  	// Bytes 33 - 47
   144  	LXVD2X	(R0)(R8),V2
   145  	VCMPEQUBCC	V2,V1,V6
   146  	BNE	CR6,foundat0		// match found at R8+32 bytes, jump out
   147  
   148  	BLE	notfound
   149  
   150  	// Bytes 48 - 63
   151  	MOVD	R9,R8			// R9 holds the final check.
   152  	LXVD2X	(R0)(R9),V2
   153  	VCMPEQUBCC	V2,V1,V6
   154  	BNE	CR6,foundat0		// Match found at R8+48 bytes, jump out
   155  
   156  	BR	notfound
   157  
   158  // If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
   159  #ifndef GOPPC64_power9
   160  #define ADJUST_FOR_CNTLZW -16
   161  #else
   162  #define ADJUST_FOR_CNTLZW 0
   163  #endif
   164  
   165  // Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
   166  // to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
   167  foundat3:
   168  	SUB	R3,R8,R3
   169  	ADD	$48+ADJUST_FOR_CNTLZW,R3
   170  	BR	vfound
   171  foundat2:
   172  	SUB	R3,R8,R3
   173  	ADD	$32+ADJUST_FOR_CNTLZW,R3
   174  	BR	vfound
   175  foundat1:
   176  	SUB	R3,R8,R3
   177  	ADD	$16+ADJUST_FOR_CNTLZW,R3
   178  	BR	vfound
   179  foundat0:
   180  	SUB	R3,R8,R3
   181  	ADD	$0+ADJUST_FOR_CNTLZW,R3
   182  vfound:
   183  	// Map equal values into a 16 bit value with earlier matches setting higher bits.
   184  #ifndef GOPPC64_power9
   185  	VBPERMQ	V6,V0,V6
   186  	MFVRD	V6,R4
   187  	CNTLZW	R4,R4
   188  #else
   189  #ifdef GOARCH_ppc64le
   190  	// Put the value back into LE ordering by swapping doublewords.
   191  	XXPERMDI	V6,V6,$2,V6
   192  #endif
   193  	_VCZBEBB	V6,R4
   194  #endif
   195  	ADD	R3,R4,R3
   196  	RET
   197  
   198  cmp16:	// Length 16 - 31
   199  	CMPU	R4,$16
   200  	ADD	R4,R3,R9
   201  	BLT	cmp8
   202  
   203  	ADD	$-16,R9,R9		// &s[len(s)-16]
   204  
   205  	// Bytes 0 - 15
   206  	LXVD2X	(R0)(R3),V2
   207  	VCMPEQUBCC	V2,V1,V6
   208  	MOVD	R3,R8
   209  	BNE	CR6,foundat0		// Match found at R8+32 bytes, jump out
   210  
   211  	BEQ	notfound
   212  
   213  	// Bytes 16 - 30
   214  	MOVD	R9,R8			// R9 holds the final check.
   215  	LXVD2X	(R0)(R9),V2
   216  	VCMPEQUBCC	V2,V1,V6
   217  	BNE	CR6,foundat0		// Match found at R8+48 bytes, jump out
   218  
   219  	BR	notfound
   220  
   221  
   222  cmp8:	// Length 8 - 15
   223  #ifdef GOPPC64_power10
   224  	// Load all the bytes into a single VSR in BE order.
   225  	SLD	$56,R4,R5
   226  	LXVLL	R3,R5,V2
   227  	// Compare and count the number which don't match.
   228  	VCMPEQUB	V2,V1,V6
   229  	VCLZLSBB	V6,R3
   230  	// If count is the number of bytes, or more. No matches are found.
   231  	CMPU	R3,R4
   232  	MOVD	$-1,R5
   233  	// Otherwise, the count is the index of the first match.
   234  	ISEL	CR0LT,R3,R5,R3
   235  	RET
   236  #else
   237  	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
   238  	RLDIMI	$16,R5,$32,R5
   239  	RLDIMI	$32,R5,$0,R5
   240  	CMPU	R4,$8
   241  	BLT	cmp4
   242  	MOVD	$-8,R11
   243  	ADD	$-8,R4,R4
   244  
   245  	_LDBEX	(R0)(R3),R10
   246  	_LDBEX	(R11)(R9),R11
   247  	CMPB	R10,R5,R10
   248  	CMPB	R11,R5,R11
   249  	CMPU	R10,$0
   250  	CMPU	R11,$0,CR1
   251  	CNTLZD	R10,R10
   252  	CNTLZD	R11,R11
   253  	SRD	$3,R10,R3
   254  	SRD	$3,R11,R11
   255  	BNE	found
   256  
   257  	ADD	R4,R11,R4
   258  	MOVD	$-1,R3
   259  	ISEL	CR1EQ,R3,R4,R3
   260  	RET
   261  
   262  cmp4:	// Length 4 - 7
   263  	CMPU	R4,$4
   264  	BLT	cmp2
   265  	MOVD	$-4,R11
   266  	ADD	$-4,R4,R4
   267  
   268  	_LWBEX	(R0)(R3),R10
   269  	_LWBEX	(R11)(R9),R11
   270  	CMPB	R10,R5,R10
   271  	CMPB	R11,R5,R11
   272  	CNTLZW	R10,R10
   273  	CNTLZW	R11,R11
   274  	CMPU	R10,$32
   275  	CMPU	R11,$32,CR1
   276  	SRD	$3,R10,R3
   277  	SRD	$3,R11,R11
   278  	BNE	found
   279  
   280  	ADD	R4,R11,R4
   281  	MOVD	$-1,R3
   282  	ISEL	CR1EQ,R3,R4,R3
   283  	RET
   284  
   285  cmp2:	// Length 2 - 3
   286  	CMPU	R4,$2
   287  	BLT	cmp1
   288  
   289  	_LHBEX	(R0)(R3),R10
   290  	CMPB	R10,R5,R10
   291  	SLDCC	$48,R10,R10
   292  	CNTLZD	R10,R10
   293  	SRD	$3,R10,R3
   294  	BNE	found
   295  
   296  cmp1:	// Length 1
   297  	MOVD	$-1,R3
   298  	ANDCC	$1,R4,R31
   299  	BEQ	found
   300  
   301  	MOVBZ	-1(R9),R10
   302  	CMPB	R10,R5,R10
   303  	ANDCC	$1,R10
   304  	ADD	$-1,R4
   305  	ISEL	CR0EQ,R3,R4,R3
   306  
   307  found:
   308  	RET
   309  #endif
   310  
   311  notfound:
   312  	MOVD $-1,R3
   313  	RET
   314  
   315
View as plain text