Text file src/internal/bytealg/equal_amd64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "asm_amd64.h"
     7  #include "textflag.h"
     8  
     9  // memequal(a, b unsafe.Pointer, size uintptr) bool
    10  TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
    11  	// On entry: AX = a, BX = b, CX = size.
    12  	// memeqbody expects a in SI, b in DI and the byte count in BX,
    13  	// so b has to leave BX before the count is moved into it.
    14  	CMPQ	AX, BX
    15  	JEQ	same_ptr	// identical pointers: equal without looking at memory
    16  	MOVQ	BX, DI	// b -> DI (must precede the write to BX below)
    17  	MOVQ	CX, BX	// size -> BX
    18  	MOVQ	AX, SI	// a -> SI
    19  	JMP	memeqbody<>(SB)	// jump, not call: memeqbody's RET returns to our caller
    20  same_ptr:
    21  	MOVQ	$1, AX	// return 1
    22  	RET
    23  
    24  // memequal_varlen(a, b unsafe.Pointer) bool
    25  TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
    26  	// On entry: AX = a, BX = b; the size is not an argument, it is read from 8(DX).
    27  	// memeqbody expects a in SI, b in DI and the byte count in BX.
    28  	// The size is only loaded when the pointers differ.
    29  	CMPQ	AX, BX
    30  	JEQ	same_ptr	// a == b: report equal immediately
    31  	MOVQ	BX, DI	// b -> DI before BX is reused for the count
    32  	MOVQ	8(DX), BX    // compiler stores size at offset 8 in the closure
    33  	MOVQ	AX, SI	// a -> SI
    34  	JMP	memeqbody<>(SB)	// jump, not call: memeqbody's RET returns to our caller
    35  same_ptr:
    36  	MOVQ	$1, AX	// return 1
    37  	RET
    38  
    39  // memeqbody compares count bytes at a with count bytes at b. It is entered by JMP from memequal and memequal_varlen.
    40  // Input:
    41  //   a in SI, b in DI
    42  //   count in BX (unsigned byte count; 0 compares equal, see the "small" path)
    43  // Output:
    44  //   result in AX: SETEQ stores 1 (equal) or 0 in its low byte; the mismatch exits inside the loops zero AX with XORQ.
    45  TEXT memeqbody<>(SB),NOSPLIT,$0-0
    46  	CMPQ	BX, $8
    47  	JB	small	// count < 8: one 8-byte load per input, trimmed with shifts
    48  	CMPQ	BX, $64
    49  	JB	bigloop	// 8 <= count < 64: skip the 64-byte loops
    50  #ifndef hasAVX2
    51  	CMPB	internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1	// run-time check of the byte at offset const_offsetX86HasAVX2 of internal/cpu's X86 symbol
    52  	JE	hugeloop_avx2	// byte is 1: use the ymm loop instead of the xmm loop
    53  
    54  	// 64 bytes at a time using xmm registers
    55  	PCALIGN $16
    56  hugeloop:
    57  	CMPQ	BX, $64
    58  	JB	bigloop	// fewer than 64 bytes left: finish 8 bytes at a time
    59  	MOVOU	(SI), X0	// unaligned 16-byte loads: four chunks of a in X0/X2/X4/X6 ...
    60  	MOVOU	(DI), X1	// ... and the matching chunks of b in X1/X3/X5/X7
    61  	MOVOU	16(SI), X2
    62  	MOVOU	16(DI), X3
    63  	MOVOU	32(SI), X4
    64  	MOVOU	32(DI), X5
    65  	MOVOU	48(SI), X6
    66  	MOVOU	48(DI), X7
    67  	PCMPEQB	X1, X0	// bytewise compare: each byte of X0 becomes 0xff where a and b match, 0x00 otherwise
    68  	PCMPEQB	X3, X2
    69  	PCMPEQB	X5, X4
    70  	PCMPEQB	X7, X6
    71  	PAND	X2, X0	// AND the four compare results together; a mismatch anywhere leaves a 0x00 byte in X0
    72  	PAND	X6, X4
    73  	PAND	X4, X0
    74  	PMOVMSKB X0, DX	// DX = one bit per byte of X0 (its top bit), 16 bits in total
    75  	ADDQ	$64, SI	// advance both pointers and shrink the count before testing the mask
    76  	ADDQ	$64, DI
    77  	SUBQ	$64, BX
    78  	CMPL	DX, $0xffff	// all 16 bits set <=> all 64 bytes matched
    79  	JEQ	hugeloop
    80  	XORQ	AX, AX	// return 0
    81  	RET
    82  #endif
    83  	// If hasAVX2 is defined at build time (presumably by an included header - confirm), the block above is omitted and execution falls from the size checks straight into hugeloop_avx2.
    84  	// 64 bytes at a time using ymm registers
    85  	PCALIGN $16
    86  hugeloop_avx2:
    87  	CMPQ	BX, $64
    88  	JB	bigloop_avx2	// fewer than 64 bytes left: leave the AVX loop via bigloop_avx2
    89  	VMOVDQU	(SI), Y0	// unaligned 32-byte loads: two chunks of a in Y0/Y2 ...
    90  	VMOVDQU	(DI), Y1	// ... and the matching chunks of b in Y1/Y3
    91  	VMOVDQU	32(SI), Y2
    92  	VMOVDQU	32(DI), Y3
    93  	VPCMPEQB	Y1, Y0, Y4	// Y4 = bytewise equality mask for the first 32 bytes
    94  	VPCMPEQB	Y2, Y3, Y5	// Y5 = bytewise equality mask for the second 32 bytes
    95  	VPAND	Y4, Y5, Y6	// Y6 is all ones only if both halves matched everywhere
    96  	VPMOVMSKB Y6, DX	// DX = one bit per byte of Y6, 32 bits in total
    97  	ADDQ	$64, SI	// advance both pointers and shrink the count before testing the mask
    98  	ADDQ	$64, DI
    99  	SUBQ	$64, BX
   100  	CMPL	DX, $0xffffffff	// all 32 bits set <=> all 64 bytes matched
   101  	JEQ	hugeloop_avx2
   102  	VZEROUPPER	// zero the upper halves of the ymm registers before returning
   103  	XORQ	AX, AX	// return 0
   104  	RET
   105  
   106  bigloop_avx2:
   107  	VZEROUPPER	// same ymm cleanup on the no-mismatch exit, then fall through into bigloop
   108  
   109  	// 8 bytes at a time using 64-bit register
   110  	PCALIGN $16
   111  bigloop:
   112  	CMPQ	BX, $8
   113  	JBE	leftover	// 8 or fewer bytes left: one final compare finishes the job
   114  	MOVQ	(SI), CX
   115  	MOVQ	(DI), DX
   116  	ADDQ	$8, SI	// advance before the compare; the CMPQ below sets the flags JEQ tests
   117  	ADDQ	$8, DI
   118  	SUBQ	$8, BX
   119  	CMPQ	CX, DX
   120  	JEQ	bigloop
   121  	XORQ	AX, AX	// return 0
   122  	RET
   123  
   124  	// remaining 0-8 bytes: compare the 8 bytes that end at SI+BX / DI+BX. When BX < 8 this re-reads bytes already found equal; it never reads before the start of the inputs because this path is only reached when the original count was >= 8.
   125  leftover:
   126  	MOVQ	-8(SI)(BX*1), CX	// last 8 bytes of a
   127  	MOVQ	-8(DI)(BX*1), DX	// last 8 bytes of b
   128  	CMPQ	CX, DX
   129  	SETEQ	AX
   130  	RET
   131  
   132  small:
   133  	CMPQ	BX, $0
   134  	JEQ	equal	// count == 0: ZF is set by the CMPQ above, so SETEQ at "equal" yields 1
   135  
   136  	LEAQ	0(BX*8), CX
   137  	NEGQ	CX	// CX = -8*count; a 64-bit shift uses only the low 6 bits, so this shifts by 64 - 8*count
   138  
   139  	CMPB	SI, $0xf8	// look only at the low byte of the address
   140  	JA	si_high	// low byte is 0xf9..0xff: an 8-byte load starting at SI could run into the next page
   141  
   142  	// load at SI won't cross a page boundary.
   143  	MOVQ	(SI), SI	// full 8-byte load; the bytes beyond count are discarded by the SHLQ before "equal"
   144  	JMP	si_finish
   145  si_high:
   146  	// address ends in 11111xxx. Load up to bytes we want, move to correct position.
   147  	MOVQ	-8(SI)(BX*1), SI	// 8 bytes ending at a+count, so the load stops exactly at the last wanted byte
   148  	SHRQ	CX, SI	// shift right by 64 - 8*count: the wanted bytes now sit in the low count bytes
   149  si_finish:
   150  
   151  	// same for DI.
   152  	CMPB	DI, $0xf8
   153  	JA	di_high
   154  	MOVQ	(DI), DI
   155  	JMP	di_finish
   156  di_high:
   157  	MOVQ	-8(DI)(BX*1), DI
   158  	SHRQ	CX, DI
   159  di_finish:
   160  
   161  	SUBQ	SI, DI	// the low 8*count bits of the difference are zero iff the first count bytes of a and b match
   162  	SHLQ	CX, DI	// shift left by 64 - 8*count to drop the unwanted high bytes; ZF is set iff the wanted bytes were equal
   163  equal:
   164  	SETEQ	AX	// uses ZF from the SHLQ above, or from CMPQ BX, $0 when count was 0
   165  	RET
   166  

View as plain text