Text file src/internal/bytealg/count_ppc64x.s

     1  // Copyright 2018 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build ppc64le || ppc64
     6  
     7  #include "go_asm.h"
     8  #include "textflag.h"
     9  
    10  TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
    11  	// Entry stub: put the arguments where countbytebody reads them.
    12  	// In: R3 = byte array pointer, R4 = length
    13  	// In: R6 = byte to count
    14  	MOVD	R6, R5		// the shared body reads the byte from R5
    15  	MTVRD	R6, V1		// it also wants the byte in V1: move it in ...
    16  	VSPLTB	$7, V1, V1	// ... then replicate it across V1
    17  	BR	countbytebody<>(SB)	// plain branch, so the body's RET returns to our caller
    18  
    19  TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
    20  	// Count the bytes equal to R5 in the R4 bytes at R3 (work done by countbytebody).
    21  	// R3 = byte array pointer, R4 = length
    22  	// R5 = byte to count (already the register countbytebody reads it from)
    23  	MTVRD	R5, V1		// move compare byte into V1
    24  	VSPLTB	$7, V1, V1	// replicate byte across V1
    25  	BR	countbytebody<>(SB)	// plain branch, so the body's RET returns to our caller
    26  
    27  // countbytebody counts how many of the R4 bytes at R3 equal the byte to count.
    28  // On entry: R3: addr of string, R4: len of string
    29  // R5: byte to count
    30  // V1: byte to count, splatted.
    31  // On exit:
    32  // R3: return value
    33  TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
    34  	MOVD	$0, R18 // byte count accumulator; it is 8x the count on every path that ends at tail_0
    35  
    36  #ifndef GOPPC64_power10
    37  	RLDIMI	$8, R5, $48, R5	// widen the byte: 1 -> 2 copies
    38  	RLDIMI	$16, R5, $32, R5	// 2 -> 4 copies
    39  	RLDIMI	$32, R5, $0, R5	// 4 -> 8 copies: fill reg with the byte to count
    40  #endif
    41  
    42  	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
    43  	BLT	tail		// Jump to the small string case
    44  	SRD	$5, R4, R20	// R20 = len / 32, the number of 32B loop iterations
    45  	MOVD	R20, CTR	// BDNZ below counts CTR down
    46  	MOVD	$16, R21	// offset of the second 16B vector in each iteration
    47  	XXLXOR	V4, V4, V4	// zero the two vector accumulators
    48  	XXLXOR	V5, V5, V5
    49  
    50  	PCALIGN	$16	// align the loop entry
    51  cmploop:
    52  	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
    53  	LXVD2X	(R21)(R3), V2	// second 16B, at R3+16
    54  	VCMPEQUB V2, V1, V2	// compare every byte against the splatted byte
    55  	VCMPEQUB V0, V1, V0
    56  	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
    57  	VPOPCNTD V0, V0
    58  	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
    59  	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
    60  	ADD	$32, R3	// advance to the next 32B
    61  	BDNZ	cmploop	// decrement CTR and loop while it is nonzero
    62  
    63  	VADDUDM	V4, V5, V5	// merge the two accumulators
    64  	MFVSRD	V5, R18	// move one doubleword of the sum to R18
    65  	VSLDOI	$8, V5, V5, V5	// rotate V5 by 8 bytes to swap its doublewords
    66  	MFVSRD	V5, R21	// move the other doubleword to R21
    67  	ADD	R21, R18, R18	// R18 = 8x the matches found by the loop
    68  	ANDCC	$31, R4, R4	// R4 = len % 32 = bytes left over; sets the condition for BEQ
    69  	// Skip the tail processing if no bytes remaining.
    70  	BEQ	tail_0
    71  
    72  #ifdef GOPPC64_power10
    73  	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.
    74  
    75  tail:	// Count the last 0 - 31 bytes.
    76  	CMP	R4, $16
    77  	BLE	small_tail_p10	// 16 or fewer left: one length-controlled load covers them
    78  	LXV	0(R3), V0	// more than 16 left: count one full 16B vector first
    79  	VCMPEQUB V0, V1, V0
    80  	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
    81  	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
    82  	ADD	R14, R18, R18	// R18 holds the true count here, not 8x
    83  	ADD	$16, R3, R3	// step past the 16 bytes just counted
    84  	ANDCC	$15, R4, R4	// R4 = bytes still left (1 - 15)
    85  
    86  small_tail_p10:
    87  	SLD	$56, R4, R6	// put the length in the top byte of R6 for LXVLL
    88  	LXVLL	R3, R6, V0	// load the R4 remaining bytes from R3
    89  	VCMPEQUB V0, V1, V0
    90  	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
    91  	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
    92  	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
    93  	ADD	R14, R18, R3	// return value; no 8x fixup is needed on this path
    94  	RET
    95  
    96  #else
    97  tail:	// Count the last 0 - 31 bytes.
    98  	CMP	R4, $16
    99  	BLT	tail_8	// fewer than 16 left: skip the 16B step
   100  	MOVD	(R3), R12	// load 16 bytes as two doublewords
   101  	MOVD	8(R3), R14
   102  	CMPB	R12, R5, R12	// each matching byte becomes 0xFF, others 0
   103  	CMPB	R14, R5, R14
   104  	POPCNTD	R12, R12	// 8 set bits per matching byte
   105  	POPCNTD	R14, R14
   106  	ADD	R12, R18, R18	// accumulate, still 8x the count
   107  	ADD	R14, R18, R18
   108  	ADD	$16, R3, R3	// step past the 16 bytes just counted
   109  	ADD	$-16, R4, R4	// 16 fewer bytes remain
   110  
   111  tail_8:	// Count the remaining 0 - 15 bytes.
   112  	CMP	R4, $8
   113  	BLT	tail_4	// fewer than 8 left: skip the 8B step
   114  	MOVD	(R3), R12	// load 8 bytes
   115  	CMPB	R12, R5, R12	// each matching byte becomes 0xFF, others 0
   116  	POPCNTD	R12, R12	// 8 set bits per matching byte
   117  	ADD	R12, R18, R18
   118  	ADD	$8, R3, R3	// step past the 8 bytes just counted
   119  	ADD	$-8, R4, R4	// 8 fewer bytes remain
   120  
   121  tail_4:	// Count the remaining 0 - 7 bytes.
   122  	CMP	R4, $4
   123  	BLT	tail_2	// fewer than 4 left: skip the 4B step
   124  	MOVWZ	(R3), R12	// load 4 bytes, zero-extended to 64 bits
   125  	CMPB	R12, R5, R12	// compares all 8 byte lanes, including the 4 zero-filled ones
   126  	SLD	$32, R12, R12	// Remove non-participating matches.
   127  	POPCNTD	R12, R12	// 8 set bits per matching byte
   128  	ADD	R12, R18, R18
   129  	ADD	$4, R3, R3	// step past the 4 bytes just counted
   130  	ADD	$-4, R4, R4	// 4 fewer bytes remain
   131  
   132  tail_2:	// Count the remaining 0 - 3 bytes.
   133  	CMP	R4, $2
   134  	BLT	tail_1	// fewer than 2 left: skip the 2B step
   135  	MOVHZ	(R3), R12	// load 2 bytes, zero-extended to 64 bits
   136  	CMPB	R12, R5, R12	// compares all 8 byte lanes, including the 6 zero-filled ones
   137  	SLD	$48, R12, R12	// Remove non-participating matches.
   138  	POPCNTD	R12, R12	// 8 set bits per matching byte
   139  	ADD	R12, R18, R18
   140  	ADD	$2, R3, R3	// step past the 2 bytes just counted
   141  	ADD	$-2, R4, R4	// 2 fewer bytes remain
   142  
   143  tail_1:	// Count the remaining 0 - 1 bytes.
   144  	CMP	R4, $1
   145  	BLT	tail_0	// nothing left
   146  	MOVBZ	(R3), R12	// load the last byte, zero-extended
   147  	CMPB	R12, R5, R12	// low byte becomes 0xFF on a match, 0 otherwise
   148  	ANDCC	$0x8, R12, R12	// keep a single bit worth 8: adds 8 on a match, matching the 8x scale
   149  	ADD	R12, R18, R18
   150  #endif
   151  
   152  tail_0:	// No remaining tail to count.
   153  	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
   154  	RET
   155  

View as plain text