compare_ppc64x.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ppc64 || ppc64le
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  // Helper names for x-form (register-indexed) loads in BE ordering.
        // On ppc64le they map to the byte-reversed loads, on ppc64 to the plain
        // loads, so either way the first byte in memory ends up most significant.
        // cmpbody relies on this before comparing the loaded values with CMPU.
    11  #ifdef  GOARCH_ppc64le
    12  #define _LDBEX	MOVDBR
    13  #define _LWBEX	MOVWBR
    14  #define _LHBEX	MOVHBR
    15  #else
    16  #define _LDBEX	MOVD
    17  #define _LWBEX	MOVW
    18  #define _LHBEX	MOVH
    19  #endif
    20  
    21  #ifdef GOPPC64_power9
        // With GOPPC64_power9 defined the SETB instruction is used directly, so no
        // constant registers are needed and SETB_INIT expands to nothing.
    22  #define SETB_CR0(rout) SETB CR0, rout
    23  #define SETB_CR1(rout) SETB CR1, rout
    24  #define SETB_INIT()
    25  #define SETB_CR0_NE(rout) SETB_CR0(rout)
    26  #else
    27  // A helper macro to emulate SETB on P8. This assumes
    28  // -1 is in R20, and 1 is in R21. crxlt and crxeq must
    29  // also be the same CR field.
        // The first ISEL selects R0 (used here as the zero value) when crxeq is
        // set and R21 (1) otherwise; the second ISEL replaces that with R20 (-1)
        // when crxlt is set. Net result in rout: -1 (lt), 0 (eq) or 1.
    30  #define _SETB(crxlt, crxeq, rout) \
    31  	ISEL	crxeq,R0,R21,rout \
    32  	ISEL	crxlt,R20,rout,rout
    33  
    34  // A special case when it is known the comparison
    35  // will always be not equal. The result must be -1 or 1,
        // so a single ISEL on CR0LT is enough.
    36  #define SETB_CR0_NE(rout) \
    37  	ISEL	CR0LT,R20,R21,rout
    38  
    39  #define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
    40  #define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
        // Loads the constants the emulation above depends on: R20 = -1, R21 = 1.
    41  #define SETB_INIT() \
    42  	MOVD	$-1,R20 \
    43  	MOVD	$1,R21
    44  #endif
    45  
    46  TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
        	// Entry stub: arranges the two (addr, len) operands into the registers
        	// cmpbody expects and returns early when both addresses are the same.
        	//
    47  	// incoming:
    48  	// R3 a addr
    49  	// R4 a len
    50  	// R6 b addr
    51  	// R7 b len
    52  	//
    53  	// on entry to cmpbody:
    54  	// R3 result of comparing len(a) with len(b); returned if the first R9 bytes match
    55  	// R5 a addr
    56  	// R6 b addr
    57  	// R9 min(len(a),len(b))
    58  	SETB_INIT()
    59  	MOVD	R3,R5		// R5 = a addr; R3 is reused for the result below
    60  	CMP	R4,R7,CR0	// CR0 = len(a) vs len(b)
    61  	CMP	R3,R6,CR7	// CR7 = a addr vs b addr
    62  	ISEL	CR0LT,R4,R7,R9	// R9 = len(a) if len(a) < len(b), else len(b)
    63  	SETB_CR0(R3)		// R3 = -1, 0 or 1 from the length compare
    64  	BEQ	CR7,LR		// same address: return the length result directly
    65  	BR	cmpbody<>(SB)
    66  
    67  TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
        	// Entry stub: same job as the slice variant, but the incoming register
        	// layout differs, so the addresses are shuffled into R5/R6 first.
        	//
    68  	// incoming:
    69  	// R3 a addr -> R5
    70  	// R4 a len  (stays in R4; only feeds CR0 and R9)
    71  	// R5 b addr -> R6
    72  	// R6 b len  (read before R6 is overwritten with b addr)
    73  	//
    74  	// on entry to cmpbody:
    75  	// R3 compare value if compared length is same.
    76  	// R5 a addr
    77  	// R6 b addr
    78  	// R9 min(len(a),len(b))
    79  	SETB_INIT()
    80  	CMP	R4,R6,CR0	// CR0 = len(a) vs len(b); R6 still holds b len here
    81  	CMP	R3,R5,CR7	// CR7 = a addr vs b addr
    82  	ISEL	CR0LT,R4,R6,R9	// R9 = len(a) if len(a) < len(b), else len(b)
    83  	MOVD	R5,R6		// R6 = b addr (b len is no longer needed)
    84  	MOVD	R3,R5		// R5 = a addr
    85  	SETB_CR0(R3)		// R3 = -1, 0 or 1 from the length compare
    86  	BEQ	CR7,LR		// same address: return the length result directly
    87  	BR	cmpbody<>(SB)
    88  
    89  #ifdef GOARCH_ppc64le
        // 16-byte read-only VPERM control vector, ppc64le only. cmpbody's
        // `different` path loads it into SWAP (V21) and permutes the two
        // mismatching vectors with it before moving them to GPRs for CMPU.
    90  DATA byteswap<>+0(SB)/8, $0x0706050403020100
    91  DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
    92  GLOBL byteswap<>+0(SB), RODATA, $16
    93  #define SWAP V21
    94  #endif
    95  
    96  TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
        	// Shared comparison body, reached by branch from the two entry stubs.
        	// in:  R3  = value to return if the first R9 bytes of a and b are equal
        	//      R5  = a addr, R6 = b addr
        	//      R9  = number of bytes to compare (min of the two lengths)
        	//      R20 = -1, R21 = 1 when SETB is emulated (see SETB_INIT)
        	// out: R3  = -1 or 1 taken from the first differing bytes, otherwise
        	//            the incoming R3 unchanged
        	// Lengths of 16B and up are compared 16B at a time with vector loads;
        	// a final load that ends exactly at the last byte (possibly overlapping
        	// bytes already checked) covers any remainder.
    97  start:
    98  	CMP	R9,$16,CR0	// CR0/CR1/CR2 = length vs 16/32/64
    99  	CMP	R9,$32,CR1
   100  	CMP	R9,$64,CR2
   101  	MOVD	$16,R10		// R10 = offset 16, used by the vector paths
   102  	BLT	cmp8		// < 16B
   103  	BLT	CR1,cmp16	// 16 - 31B
   104  	BLT	CR2,cmp32	// 32 - 63B
   105  
   106  cmp64:	// >= 64B
   107  	DCBT	(R5)		// optimize for size>=64
   108  	DCBT	(R6)		// cache hint
   109  
   110  	SRD	$6,R9,R14	// There is at least one iteration.
   111  	MOVD	R14,CTR		// CTR = number of full 64B blocks
   112  	ANDCC   $63,R9,R9	// R9 = tail length; CR0 EQ if there is no tail
   113  	CMP	R9,$16,CR1	// Do setup for tail check early on.
   114  	CMP	R9,$32,CR2
   115  	CMP	R9,$48,CR3
   116  	ADD	$-16,R9,R9	// R9 = tail length - 16: offset of the tail's final 16B
   117  
   118  	MOVD	$32,R11		// set offsets to load into vector
   119  	MOVD	$48,R12		// set offsets to load into vector
   120  
   121  	PCALIGN	$16
   122  cmp64_loop:
   123  	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
   124  	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
   125  	VCMPEQUDCC	V3,V4,V1
   126  	BGE	CR6,different	// jump out if it's different
   127  
   128  	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
   129  	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
   130  	VCMPEQUDCC	V3,V4,V1
   131  	BGE	CR6,different
   132  
   133  	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
   134  	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
   135  	VCMPEQUDCC	V3,V4,V1
   136  	BGE	CR6,different
   137  
   138  	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
   139  	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
   140  	VCMPEQUDCC	V3,V4,V1
   141  	BGE	CR6,different
   142  
   143  	ADD	$64,R5,R5	// increment to next 64 bytes of A
   144  	ADD	$64,R6,R6	// increment to next 64 bytes of B
   145  	BDNZ	cmp64_loop
   146  	BEQ	CR0,LR		// beqlr: no tail (CR0 from the ANDCC above), return R3
   147  
   148  	// Finish out tail with minimal overlapped checking.
   149  	// Note, 0 tail is handled by beqlr above.
   150  	BLE	CR1,cmp64_tail_gt0	// tail <= 16B
   151  	BLE	CR2,cmp64_tail_gt16	// tail <= 32B
   152  	BLE	CR3,cmp64_tail_gt32	// tail <= 48B
   153  
   154  cmp64_tail_gt48: // 49 - 63 B
        	// Three full 16B compares, then the final 16B via cmp64_tail_gt0.
   155  	LXVD2X	(R0)(R5),V3
   156  	LXVD2X	(R0)(R6),V4
   157  	VCMPEQUDCC	V3,V4,V1
   158  	BGE	CR6,different
   159  
   160  	LXVD2X	(R5)(R10),V3
   161  	LXVD2X	(R6)(R10),V4
   162  	VCMPEQUDCC	V3,V4,V1
   163  	BGE	CR6,different
   164  
   165  	LXVD2X	(R5)(R11),V3
   166  	LXVD2X	(R6)(R11),V4
   167  	VCMPEQUDCC	V3,V4,V1
   168  	BGE	CR6,different
   169  
   170  	BR cmp64_tail_gt0
   171  
   172  	PCALIGN $16
   173  cmp64_tail_gt32: // 33 - 48B
        	// Two full 16B compares, then the final 16B via cmp64_tail_gt0.
   174  	LXVD2X	(R0)(R5),V3
   175  	LXVD2X	(R0)(R6),V4
   176  	VCMPEQUDCC	V3,V4,V1
   177  	BGE	CR6,different
   178  
   179  	LXVD2X	(R5)(R10),V3
   180  	LXVD2X	(R6)(R10),V4
   181  	VCMPEQUDCC	V3,V4,V1
   182  	BGE	CR6,different
   183  
   184  	BR cmp64_tail_gt0
   185  
   186  	PCALIGN $16
   187  cmp64_tail_gt16: // 17 - 32B
        	// One full 16B compare, then the final 16B via cmp64_tail_gt0.
   188  	LXVD2X	(R0)(R5),V3
   189  	LXVD2X	(R0)(R6),V4
   190  	VCMPEQUDCC	V3,V4,V1
   191  	BGE	CR6,different
   192  
   193  	BR cmp64_tail_gt0
   194  
   195  	PCALIGN $16
   196  cmp64_tail_gt0: // 1 - 16B
   197  	LXVD2X	(R5)(R9),V3	// final 16B of the tail; R9 = tail length - 16, so this may overlap bytes already compared
   198  	LXVD2X	(R6)(R9),V4
   199  	VCMPEQUDCC	V3,V4,V1
   200  	BGE	CR6,different
   201  
   202  	RET			// all bytes equal: return R3 as set by the caller
   203  
   204  	PCALIGN $16
   205  cmp32:	// 32 - 63B
   206  	ANDCC	$31,R9,R9	// R9 = length - 32; CR0 EQ if exactly 32B
   207  
   208  	LXVD2X	(R0)(R5),V3	// bytes [0, 16)
   209  	LXVD2X	(R0)(R6),V4
   210  	VCMPEQUDCC	V3,V4,V1
   211  	BGE	CR6,different
   212  
   213  	LXVD2X	(R10)(R5),V3	// bytes [16, 32)
   214  	LXVD2X	(R10)(R6),V4
   215  	VCMPEQUDCC	V3,V4,V1
   216  	BGE	CR6,different
   217  
   218  	BEQ	CR0,LR		// exactly 32B and all equal: return R3
   219  	ADD	R9,R10,R10	// R10 = (length - 32) + 16
   220  
   221  	LXVD2X	(R9)(R5),V3	// bytes [length-32, length-16), overlapping the above
   222  	LXVD2X	(R9)(R6),V4
   223  	VCMPEQUDCC	V3,V4,V1
   224  	BGE	CR6,different
   225  
   226  	LXVD2X	(R10)(R5),V3	// bytes [length-16, length)
   227  	LXVD2X	(R10)(R6),V4
   228  	VCMPEQUDCC	V3,V4,V1
   229  	BGE	CR6,different
   230  	RET
   231  
   232  	PCALIGN $16
   233  cmp16:	// 16 - 31B
   234  	ANDCC	$15,R9,R9	// R9 = length - 16; CR0 EQ if exactly 16B
   235  	LXVD2X	(R0)(R5),V3	// bytes [0, 16)
   236  	LXVD2X	(R0)(R6),V4
   237  	VCMPEQUDCC	V3,V4,V1
   238  	BGE	CR6,different
   239  	BEQ	CR0,LR		// exactly 16B and all equal: return R3
   240  
   241  	LXVD2X	(R9)(R5),V3	// bytes [length-16, length), overlapping the above
   242  	LXVD2X	(R9)(R6),V4
   243  	VCMPEQUDCC	V3,V4,V1
   244  	BGE	CR6,different
   245  	RET
   246  
   247  	PCALIGN $16
   248  different:
        	// Reached with the mismatching 16B of a in V3 and of b in V4.
        	// The two doublewords are compared as unsigned integers, upper first.
   249  #ifdef	GOARCH_ppc64le
   250  	MOVD	$byteswap<>+00(SB),R16
   251  	LXVD2X	(R16)(R0),SWAP	// Set up swap string
   252  
   253  	VPERM	V3,V3,SWAP,V3
   254  	VPERM	V4,V4,SWAP,V4
   255  #endif
   256  
   257  	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
   258  	MFVSRD	VS36,R10
   259  
   260  	CMPU	R16,R10
   261  	BEQ	lower		// upper doublewords match: the difference is in the lower ones
   262  	SETB_CR0_NE(R3)		// R3 = -1 or 1
   263  	RET
   264  
   265  	PCALIGN $16
   266  lower:
   267  	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
   268  	MFVSRD	VS35,R16
   269  	VSLDOI	$8,V4,V4,V4
   270  	MFVSRD	VS36,R10
   271  
   272  	CMPU	R16,R10
   273  	SETB_CR0_NE(R3)		// vectors differ and upper halves matched, so this is -1 or 1
   274  	RET
   275  
   276  	PCALIGN $16
   277  cmp8:	// 8 - 15B (0 - 15B if GOPPC64_power10)
   278  #ifdef GOPPC64_power10
   279  	SLD	$56,R9,R9	// shift the length into the top byte of R9 for the LXVLLs below
   280  	LXVLL	R5,R9,V3	// Load bytes starting from MSB to LSB, unused are zero filled.
   281  	LXVLL	R6,R9,V4
   282  	VCMPUQ	V3,V4,CR0	// Compare as a 128b integer.
   283  	SETB_CR0(R6)
   284  	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
   285  	RET
   286  #else
   287  	CMP	R9,$8
   288  	BLT	cmp4
   289  	ANDCC	$7,R9,R9	// R9 = length - 8
   290  	_LDBEX	(R0)(R5),R10	// first 8B of a and b
   291  	_LDBEX	(R0)(R6),R11
   292  	_LDBEX	(R9)(R5),R12	// last 8B of a and b (overlaps the first 8B unless length is 16)
   293  	_LDBEX	(R9)(R6),R14
   294  	CMPU	R10,R11,CR0
   295  	SETB_CR0(R5)		// R5 = result for the first 8B
   296  	CMPU	R12,R14,CR1
   297  	SETB_CR1(R6)		// R6 = result for the last 8B
   298  	CRAND   CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
   299  	ISEL	CR0EQ,R6,R5,R4	// R4 = first 8B equal ? last-8B result : first-8B result
   300  	ISEL	CR1EQ,R3,R4,R3	// R3 = both equal ? incoming R3 : R4
   301  	RET
   302  
   303  	PCALIGN	$16
   304  cmp4:	// 4 - 7B
   305  	CMP	R9,$4
   306  	BLT	cmp2
   307  	ANDCC	$3,R9,R9	// R9 = length - 4
   308  	_LWBEX	(R0)(R5),R10	// first 4B of a and b
   309  	_LWBEX	(R0)(R6),R11
   310  	_LWBEX	(R9)(R5),R12	// last 4B of a and b (may overlap the first 4B)
   311  	_LWBEX	(R9)(R6),R14
   312  	RLDIMI	$32,R10,$0,R12	// R12 = first load in the upper 32 bits, last load below
   313  	RLDIMI	$32,R11,$0,R14	// same packing for b
   314  	CMPU	R12,R14		// one unsigned compare covers both loads
   315  	BR	cmp0
   316  
   317  	PCALIGN $16
   318  cmp2:	// 2 - 3B
   319  	CMP	R9,$2
   320  	BLT	cmp1
   321  	ANDCC	$1,R9,R9	// R9 = length - 2
   322  	_LHBEX	(R0)(R5),R10	// first 2B of a and b
   323  	_LHBEX	(R0)(R6),R11
   324  	_LHBEX	(R9)(R5),R12	// last 2B of a and b (may overlap the first 2B)
   325  	_LHBEX	(R9)(R6),R14
   326  	RLDIMI	$32,R10,$0,R12	// R12 = first load in the upper 32 bits, last load below
   327  	RLDIMI	$32,R11,$0,R14	// same packing for b
   328  	CMPU	R12,R14
   329  	BR	cmp0
   330  
   331  	PCALIGN $16
   332  cmp1:	// 0 - 1B
   333  	CMP	R9,$0
   334  	BEQ	cmp0		// nothing to compare: CR0 EQ makes cmp0 keep the incoming R3
   335  	MOVBZ	(R5),R10
   336  	MOVBZ	(R6),R11
   337  	CMPU	R10,R11
   338  cmp0:
        	// CR0 holds the final compare. If it is EQ the incoming R3 is kept,
        	// otherwise R3 becomes the -1/1 derived from CR0.
   339  	SETB_CR0(R6)
   340  	ISEL	CR0EQ,R3,R6,R3
   341  	RET
   342  #endif
   343
View as plain text