// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"

// Count returns the number of bytes in a byte slice equal to a given byte.
//
// The arguments are consumed straight from registers and the result is left
// in R3, so the symbol must use the register-based internal ABI. Without the
// <ABIInternal> selector the assembler assumes the stack-based ABI0, and this
// body would neither load its arguments nor store its result.
//
// In:
//   R3 = byte array pointer
//   R4 = length
//   R6 = byte to count
// (R5 sits between the length and the byte; it is simply overwritten below.)
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	MTVRD	R6, V1		// move compare byte
	MOVD	R6, R5		// countbytebody expects the byte in R5
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// CountString is the string flavour of Count. The byte to count already
// arrives in R5, which is where countbytebody expects it, so only the
// vector copy has to be prepared.
//
// Like Count, it relies on register arguments and therefore needs the
// <ABIInternal> selector.
//
// In:
//   R3 = byte array pointer
//   R4 = length
//   R5 = byte to count
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// countbytebody is the shared implementation behind Count and CountString.
//
// On entry:
//   R3: addr of string
//   R4: len of string
//   R5: byte to count
//   V1: byte to count, splatted.
// On exit:
//   R3: return value
//
// Strategy:
//   - Inputs of 32 bytes or more run a vector loop that handles 32 bytes per
//     iteration, using two independent accumulators (V4, V5).
//   - The remaining 0 - 31 bytes are handled by a tail sequence. Two tails
//     exist, selected at build time by GOPPC64_power10.
//
// R18 holds the running total. In the vector loop and in the non-power10
// tail a matching byte contributes 8 (it is counted as eight set bits), so
// the total is divided by 8 before it is returned.
//
// Registers written: R3, R4, R12, R14, R18, R20, R21, CTR, V0, V2, V4, V5,
// plus R5 (non-power10 build) or R6 (power10 build).
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18 // byte count

#ifndef GOPPC64_power10
	// The scalar tail compares 8 bytes at a time with CMPB, so it needs the
	// target byte in every byte lane of R5. Each insert doubles the number
	// of populated lanes: 1 -> 2 -> 4 -> 8.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// R20 = number of whole 32 byte chunks
	MOVD	R20, CTR
	MOVD	$16, R21	// offset of the second vector in each chunk
	XXLXOR	V4, V4, V4	// clear both accumulators
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop

	// Reduce: merge the two accumulators, then add the two doubleword
	// buckets of the merged vector together into R18.
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18
	VSLDOI	$8, V5, V5, V5	// swap the doublewords so the other bucket can be read
	MFVSRD	V5, R21
	ADD	R21, R18, R18
	ANDCC	$31, R4, R4	// R4 = bytes left over after the 32 byte chunks
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

#ifdef GOPPC64_power10
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4

small_tail_p10:
	SLD	$56, R4, R6	// LXVLL takes its length from the top byte of the register
	LXVLL	R3, R6, V0
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// R18 is already an exact count here, so return directly
	RET

#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12	// 0xFF in every byte lane that equals the target byte
	CMPB	R14, R5, R14
	POPCNTD	R12, R12	// 8 set bits per matching byte
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	// The zero-extending load leaves zero padding in the upper lanes, which
	// would compare equal when the target byte is 0. Shift those lanes out.
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12
	// Keep one bit of the low lane's 0xFF/0x00 result. The bit worth 8 is
	// chosen so a match adds 8, the same scale as the popcounts above.
	ANDCC	$0x8, R12, R12
	ADD	R12, R18, R18
#endif

tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET