Text file src/internal/bytealg/index_amd64.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  #include "go_asm.h"
     6  #include "textflag.h"
     7  
     8  TEXT ·Index(SB),NOSPLIT,$0-56	// entry stub: args at 0/8 (a) and 24/32 (b), result at 48; gaps at 16/40 are presumably slice caps
     9  	LEAQ ret+48(FP), R11	// R11 = address of the result slot; indexbody stores through it
    10  	MOVQ b_len+32(FP), AX	// AX  = length of b, the string being searched for
    11  	MOVQ b_base+24(FP), R8	// R8  = pointer to b
    12  	MOVQ a_len+8(FP), DX	// DX  = length of a, the string being searched in
    13  	MOVQ a_base+0(FP), DI	// DI  = pointer to a (advanced by the search loops)
    14  	MOVQ DI, R10	// R10 = untouched copy of a's base, used to turn the match pointer into an index
    15  	JMP  indexbody<>(SB)	// tail-jump: indexbody writes the result and returns to our caller
    16  
    17  TEXT ·IndexString(SB),NOSPLIT,$0-40	// entry stub: args at 0/8 (a) and 16/24 (b), result at 32
    18  	LEAQ ret+32(FP), R11	// R11 = address of the result slot; indexbody stores through it
    19  	MOVQ b_len+24(FP), AX	// AX  = length of b, the string being searched for
    20  	MOVQ b_base+16(FP), R8	// R8  = pointer to b
    21  	MOVQ a_len+8(FP), DX	// DX  = length of a, the string being searched in
    22  	MOVQ a_base+0(FP), DI	// DI  = pointer to a (advanced by the search loops)
    23  	MOVQ DI, R10	// R10 = untouched copy of a's base, used to turn the match pointer into an index
    24  	JMP  indexbody<>(SB)	// tail-jump: indexbody writes the result and returns to our caller
    25  
    26  // AX: length of the string we are searching for (the needle)
    27  // DX: length of the string we are searching in (the haystack)
    28  // DI: pointer to the string we are searching in
    29  // R8: pointer to the string we are searching for
        // R10: copy of the initial DI; subtracted from the match pointer to form the result index
    30  // R11: address where the return value is stored
    31  // Note: We want the lengths in DX and AX, because PCMPESTRI implicitly consumes them
        //
        // indexbody stores to (R11) the byte offset of the first occurrence of the
        // needle in the haystack, or -1 if there is none, and returns.
        //
        // The needle length selects a specialised loop. Lengths 2, 4, 8, 16 and 32
        // use one fixed-width compare per candidate position. The ranges in between
        // (3, 5-7, 9-15, 17-31, 33 and up) compare the first W bytes and the last W
        // bytes of the needle with two overlapping loads. When the haystack has at
        // least 16 bytes, SSE4.2 is usable and the needle is shorter than 12 bytes,
        // the PCMPESTRI loop at sse42 is used instead.
        //
        // Needle lengths below 2 are not handled separately: the first branch treats
        // any AX <= 2 as a 2-byte needle (callers presumably guarantee len >= 2 --
        // TODO confirm against the Go callers).
        //
        // Besides its inputs, the routine overwrites BX, CX, SI, R9, X0-X3 / Y0-Y4
        // and the flags, depending on the path taken.
    32  TEXT indexbody<>(SB),NOSPLIT,$0
    33  	CMPQ AX, DX
    34  	JA fail // needle longer than haystack: no match possible
    35  	CMPQ DX, $16
    36  	JAE sse42 // haystack has >= 16 bytes: consider the PCMPESTRI path
    37  no_sse42:
        	// Generic dispatch on needle length. In every loop below DI is the
        	// current candidate position and DX is rewritten to hold the first
        	// position past the last valid candidate (exclusive upper bound).
    38  	CMPQ AX, $2
    39  	JA   _3_or_more
    40  	MOVW (R8), R8 // R8 = first 2 bytes of the needle
    41  	LEAQ -1(DI)(DX*1), DX // DX = DI + len(haystack) - 1
    42  	PCALIGN $16
    43  loop2:
    44  	MOVW (DI), SI
    45  	CMPW SI,R8
    46  	JZ success
    47  	ADDQ $1,DI
    48  	CMPQ DI,DX
    49  	JB loop2
    50  	JMP fail
    51  _3_or_more:
    52  	CMPQ AX, $3
    53  	JA   _4_or_more
        	// len == 3: compare bytes 0-1, then the overlapping bytes 1-2.
    54  	MOVW 1(R8), BX // BX = needle bytes 1..2
    55  	MOVW (R8), R8 // R8 = needle bytes 0..1
    56  	LEAQ -2(DI)(DX*1), DX // DX = DI + len(haystack) - 2
    57  loop3:
    58  	MOVW (DI), SI
    59  	CMPW SI,R8
    60  	JZ   partial_success3
    61  	ADDQ $1,DI
    62  	CMPQ DI,DX
    63  	JB loop3
    64  	JMP fail
    65  partial_success3:
        	// First two bytes matched; confirm with the last two.
    66  	MOVW 1(DI), SI
    67  	CMPW SI,BX
    68  	JZ success
    69  	ADDQ $1,DI
    70  	CMPQ DI,DX
    71  	JB loop3
    72  	JMP fail
    73  _4_or_more:
    74  	CMPQ AX, $4
    75  	JA   _5_or_more
        	// len == 4: a single 32-bit compare per position.
    76  	MOVL (R8), R8
    77  	LEAQ -3(DI)(DX*1), DX // DX = DI + len(haystack) - 3
    78  loop4:
    79  	MOVL (DI), SI
    80  	CMPL SI,R8
    81  	JZ   success
    82  	ADDQ $1,DI
    83  	CMPQ DI,DX
    84  	JB loop4
    85  	JMP fail
    86  _5_or_more:
    87  	CMPQ AX, $7
    88  	JA   _8_or_more
        	// len in 5..7: compare the first 4 and the last 4 needle bytes
        	// (the two 32-bit windows overlap).
    89  	LEAQ 1(DI)(DX*1), DX
    90  	SUBQ AX, DX // DX = DI + len(haystack) - len(needle) + 1
    91  	MOVL -4(R8)(AX*1), BX // BX = last 4 bytes of the needle
    92  	MOVL (R8), R8 // R8 = first 4 bytes of the needle
    93  loop5to7:
    94  	MOVL (DI), SI
    95  	CMPL SI,R8
    96  	JZ   partial_success5to7
    97  	ADDQ $1,DI
    98  	CMPQ DI,DX
    99  	JB loop5to7
   100  	JMP fail
   101  partial_success5to7:
        	// Head matched; compare the 4 bytes ending at DI+len(needle).
   102  	MOVL -4(AX)(DI*1), SI
   103  	CMPL SI,BX
   104  	JZ success
   105  	ADDQ $1,DI
   106  	CMPQ DI,DX
   107  	JB loop5to7
   108  	JMP fail
   109  _8_or_more:
   110  	CMPQ AX, $8
   111  	JA   _9_or_more
        	// len == 8: a single 64-bit compare per position.
   112  	MOVQ (R8), R8
   113  	LEAQ -7(DI)(DX*1), DX // DX = DI + len(haystack) - 7
   114  loop8:
   115  	MOVQ (DI), SI
   116  	CMPQ SI,R8
   117  	JZ   success
   118  	ADDQ $1,DI
   119  	CMPQ DI,DX
   120  	JB loop8
   121  	JMP fail
   122  _9_or_more:
        	// Also entered from sse42 when len(needle) >= 12; the dispatch on AX
        	// simply continues from here.
   123  	CMPQ AX, $15
   124  	JA   _16_or_more
        	// len in 9..15: compare the first 8 and the last 8 needle bytes.
   125  	LEAQ 1(DI)(DX*1), DX
   126  	SUBQ AX, DX // DX = DI + len(haystack) - len(needle) + 1
   127  	MOVQ -8(R8)(AX*1), BX // BX = last 8 bytes of the needle
   128  	MOVQ (R8), R8 // R8 = first 8 bytes of the needle
   129  loop9to15:
   130  	MOVQ (DI), SI
   131  	CMPQ SI,R8
   132  	JZ   partial_success9to15
   133  	ADDQ $1,DI
   134  	CMPQ DI,DX
   135  	JB loop9to15
   136  	JMP fail
   137  partial_success9to15:
        	// Head matched; compare the 8 bytes ending at DI+len(needle).
   138  	MOVQ -8(AX)(DI*1), SI
   139  	CMPQ SI,BX
   140  	JZ success
   141  	ADDQ $1,DI
   142  	CMPQ DI,DX
   143  	JB loop9to15
   144  	JMP fail
   145  _16_or_more:
   146  	CMPQ AX, $16
   147  	JA   _17_or_more
        	// len == 16: one 128-bit byte-wise compare per position.
   148  	MOVOU (R8), X1 // X1 = the whole needle
   149  	LEAQ -15(DI)(DX*1), DX // DX = DI + len(haystack) - 15
   150  loop16:
   151  	MOVOU (DI), X2
   152  	PCMPEQB X1, X2
   153  	PMOVMSKB X2, SI // SI = one bit per byte lane that compared equal
   154  	CMPQ  SI, $0xffff // all 16 lanes equal?
   155  	JE   success
   156  	ADDQ $1,DI
   157  	CMPQ DI,DX
   158  	JB loop16
   159  	JMP fail
   160  _17_or_more:
   161  	CMPQ AX, $31
   162  	JA   _32_or_more
        	// len in 17..31: compare the first 16 and the last 16 needle bytes.
   163  	LEAQ 1(DI)(DX*1), DX
   164  	SUBQ AX, DX // DX = DI + len(haystack) - len(needle) + 1
   165  	MOVOU -16(R8)(AX*1), X0 // X0 = last 16 bytes of the needle
   166  	MOVOU (R8), X1 // X1 = first 16 bytes of the needle
   167  loop17to31:
   168  	MOVOU (DI), X2
   169  	PCMPEQB X1,X2
   170  	PMOVMSKB X2, SI
   171  	CMPQ  SI, $0xffff
   172  	JE   partial_success17to31
   173  	ADDQ $1,DI
   174  	CMPQ DI,DX
   175  	JB loop17to31
   176  	JMP fail
   177  partial_success17to31:
        	// Head matched; compare the 16 bytes ending at DI+len(needle).
   178  	MOVOU -16(AX)(DI*1), X3
   179  	PCMPEQB X0, X3
   180  	PMOVMSKB X3, SI
   181  	CMPQ  SI, $0xffff
   182  	JE success
   183  	ADDQ $1,DI
   184  	CMPQ DI,DX
   185  	JB loop17to31
   186  	JMP fail
   187  // We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
   188  // So no need to check cpuid
        // The AVX2 paths leave through fail_avx2 / success_avx2, which execute
        // VZEROUPPER before returning.
   189  _32_or_more:
   190  	CMPQ AX, $32
   191  	JA   _33_to_63
        	// len == 32: one 256-bit byte-wise compare per position.
   192  	VMOVDQU (R8), Y1 // Y1 = the whole needle
   193  	LEAQ -31(DI)(DX*1), DX // DX = DI + len(haystack) - 31
   194  loop32:
   195  	VMOVDQU (DI), Y2
   196  	VPCMPEQB Y1, Y2, Y3
   197  	VPMOVMSKB Y3, SI
   198  	CMPL  SI, $0xffffffff // all 32 lanes equal?
   199  	JE   success_avx2
   200  	ADDQ $1,DI
   201  	CMPQ DI,DX
   202  	JB loop32
   203  	JMP fail_avx2
   204  _33_to_63:
        	// len > 32: compare the first 32 and the last 32 needle bytes.
        	// There is no upper-bound check on AX here; the two windows only
        	// cover the whole needle when len <= 64 (the comment above says the
        	// caller's cutoff is 63).
   205  	LEAQ 1(DI)(DX*1), DX
   206  	SUBQ AX, DX // DX = DI + len(haystack) - len(needle) + 1
   207  	VMOVDQU -32(R8)(AX*1), Y0 // Y0 = last 32 bytes of the needle
   208  	VMOVDQU (R8), Y1 // Y1 = first 32 bytes of the needle
   209  loop33to63:
   210  	VMOVDQU (DI), Y2
   211  	VPCMPEQB Y1, Y2, Y3
   212  	VPMOVMSKB Y3, SI
   213  	CMPL  SI, $0xffffffff
   214  	JE   partial_success33to63
   215  	ADDQ $1,DI
   216  	CMPQ DI,DX
   217  	JB loop33to63
   218  	JMP fail_avx2
   219  partial_success33to63:
        	// Head matched; compare the 32 bytes ending at DI+len(needle).
   220  	VMOVDQU -32(AX)(DI*1), Y3
   221  	VPCMPEQB Y0, Y3, Y4
   222  	VPMOVMSKB Y4, SI
   223  	CMPL  SI, $0xffffffff
   224  	JE success_avx2
   225  	ADDQ $1,DI
   226  	CMPQ DI,DX
   227  	JB loop33to63
        	// Loop exhausted: fall through into fail_avx2.
   228  fail_avx2:
   229  	VZEROUPPER
        	// Falls through into fail.
   230  fail:
   231  	MOVQ $-1, (R11) // result = -1: needle not found
   232  	RET
   233  success_avx2:
   234  	VZEROUPPER
   235  	JMP success
   236  sse42:
        	// Reached when len(haystack) >= 16. Unless the build defines
        	// hasSSE42, consult the runtime CPU feature flag and fall back to
        	// the generic dispatch when SSE4.2 is not available.
   237  #ifndef hasSSE42
   238  	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
   239  	JNE no_sse42
   240  #endif
   241  	CMPQ AX, $12
   242  	// PCMPESTRI is slower than normal compare,
   243  	// so using it makes sense only if we advance 4+ bytes per compare
   244  	// This value was determined experimentally and is the ~same
   245  	// on Nehalem (first with SSE42) and Haswell.
   246  	JAE _9_or_more // len(needle) >= 12 (flags from the CMPQ above): use the generic paths
        	// The needle is shorter than 12 bytes here, yet MOVOU below reads 16
        	// bytes from R8. The test is zero when bits 4-11 of R8+16 are all
        	// clear, i.e. R8 lies in the last 16 bytes of a 4 KiB-aligned
        	// region; presumably this avoids a load that could run into the
        	// next page -- NOTE(review): confirm.
   247  	LEAQ 16(R8), SI
   248  	TESTW $0xff0, SI
   249  	JEQ no_sse42
   250  	MOVOU (R8), X1 // X1 = needle (plus whatever bytes follow it)
   251  	LEAQ -15(DI)(DX*1), SI // SI = first position from which a 16-byte read would pass the haystack end
   252  	MOVQ $16, R9
   253  	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
   254  	PCALIGN $16
   255  loop_sse42:
   256  	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
   257  	// for equality (bits 2,3 are 11)
   258  	// result is not masked or inverted (bits 4,5 are 00)
   259  	// and corresponds to first matching byte (bit 6 is 0)
   260  	PCMPESTRI $0x0c, (DI), X1
   261  	// CX == 16 means no match,
   262  	// CX > R9 means partial match at the end of the string,
   263  	// otherwise sep is at offset CX from X1 start
   264  	CMPQ CX, R9
   265  	JBE sse42_success // CX <= 16-len(sep): the whole needle fits inside this window
   266  	ADDQ R9, DI
   267  	CMPQ DI, SI
   268  	JB loop_sse42
        	// Tail: examine the last 16 bytes of the haystack (they start at
        	// SI-1) so positions skipped by the final stride are still checked.
   269  	PCMPESTRI $0x0c, -1(SI), X1
   270  	CMPQ CX, R9
   271  	JA fail
   272  	LEAQ -1(SI), DI // DI = start of the tail window
   273  sse42_success:
   274  	ADDQ CX, DI // DI = window start + offset of the match within the window
   275  success:
   276  	SUBQ R10, DI // convert the match pointer into an index relative to the haystack base
   277  	MOVQ DI, (R11) // store the result
   278  	RET
   279  

View as plain text