scan.go

     1  // Copyright 2013 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package bufio
     6  
     7  import (
     8  	"bytes"
     9  	"errors"
    10  	"io"
    11  	"unicode/utf8"
    12  )
    13  
    14  // Scanner provides a convenient interface for reading data such as
    15  // a file of newline-delimited lines of text. Successive calls to
    16  // the [Scanner.Scan] method will step through the 'tokens' of a file, skipping
    17  // the bytes between the tokens. The specification of a token is
    18  // defined by a split function of type [SplitFunc]; the default split
    19  // function breaks the input into lines with line termination stripped. [Scanner.Split]
    20  // functions are defined in this package for scanning a file into
    21  // lines, bytes, UTF-8-encoded runes, and space-delimited words. The
    22  // client may instead provide a custom split function.
    23  //
    24  // Scanning stops unrecoverably at EOF, the first I/O error, or a token too
    25  // large to fit in the [Scanner.Buffer]. When a scan stops, the reader may have
    26  // advanced arbitrarily far past the last token. Programs that need more
    27  // control over error handling or large tokens, or must run sequential scans
    28  // on a reader, should use [bufio.Reader] instead.
    29  type Scanner struct {
    30  	r            io.Reader // The reader provided by the client.
    31  	split        SplitFunc // The function to split the tokens.
    32  	maxTokenSize int       // Maximum size of a token; modified by tests.
    33  	token        []byte    // Last token returned by split.
    34  	buf          []byte    // Buffer used as argument to split.
    35  	start        int       // First non-processed byte in buf.
    36  	end          int       // End of data in buf.
    37  	err          error     // Sticky error.
    38  	empties      int       // Count of successive empty tokens.
    39  	scanCalled   bool      // Scan has been called; buffer is in use.
    40  	done         bool      // Scan has finished.
    41  }
    42  
    43  // SplitFunc is the signature of the split function used to tokenize the
    44  // input. The arguments are an initial substring of the remaining unprocessed
    45  // data and a flag, atEOF, that reports whether the [Reader] has no more data
    46  // to give. The return values are the number of bytes to advance the input
    47  // and the next token to return to the user, if any, plus an error, if any.
    48  //
    49  // Scanning stops if the function returns an error, in which case some of
    50  // the input may be discarded. If that error is [ErrFinalToken], scanning
    51  // stops with no error. A non-nil token delivered with [ErrFinalToken]
    52  // will be the last token, and a nil token with [ErrFinalToken]
    53  // immediately stops the scanning.
    54  //
    55  // Otherwise, the [Scanner] advances the input. If the token is not nil,
    56  // the [Scanner] returns it to the user. If the token is nil, the
    57  // Scanner reads more data and continues scanning; if there is no more
    58  // data--if atEOF was true--the [Scanner] returns. If the data does not
    59  // yet hold a complete token, for instance if it has no newline while
    60  // scanning lines, a [SplitFunc] can return (0, nil, nil) to signal the
    61  // [Scanner] to read more data into the slice and try again with a
    62  // longer slice starting at the same point in the input.
    63  //
    64  // The function is never called with an empty data slice unless atEOF
    65  // is true. If atEOF is true, however, data may be non-empty and,
    66  // as always, holds unprocessed text.
    67  type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)
    68  
    69  // Errors returned by Scanner.
    70  var (
    71  	ErrTooLong         = errors.New("bufio.Scanner: token too long")
    72  	ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
    73  	ErrAdvanceTooFar   = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
    74  	ErrBadReadCount    = errors.New("bufio.Scanner: Read returned impossible count")
    75  )
    76  
    77  const (
    78  	// MaxScanTokenSize is the maximum size used to buffer a token
    79  	// unless the user provides an explicit buffer with [Scanner.Buffer].
    80  	// The actual maximum token size may be smaller as the buffer
    81  	// may need to include, for instance, a newline.
    82  	MaxScanTokenSize = 64 * 1024
    83  
    84  	startBufSize = 4096 // Size of initial allocation for buffer.
    85  )
    86  
    87  // NewScanner returns a new [Scanner] to read from r.
    88  // The split function defaults to [ScanLines].
    89  func NewScanner(r io.Reader) *Scanner {
    90  	return &Scanner{
    91  		r:            r,
    92  		split:        ScanLines,
    93  		maxTokenSize: MaxScanTokenSize,
    94  	}
    95  }
    96  
    97  // Err returns the first non-EOF error that was encountered by the [Scanner].
    98  func (s *Scanner) Err() error {
    99  	if s.err == io.EOF {
   100  		return nil
   101  	}
   102  	return s.err
   103  }
   104  
   105  // Bytes returns the most recent token generated by a call to [Scanner.Scan].
   106  // The underlying array may point to data that will be overwritten
   107  // by a subsequent call to Scan. It does no allocation.
   108  func (s *Scanner) Bytes() []byte {
   109  	return s.token
   110  }
   111  
   112  // Text returns the most recent token generated by a call to [Scanner.Scan]
   113  // as a newly allocated string holding its bytes.
   114  func (s *Scanner) Text() string {
   115  	return string(s.token)
   116  }
   117  
   118  // ErrFinalToken is a special sentinel error value. It is intended to be
   119  // returned by a Split function to indicate that the scanning should stop
   120  // with no error. If the token being delivered with this error is not nil,
   121  // the token is the last token.
   122  //
   123  // The value is useful to stop processing early or when it is necessary to
   124  // deliver a final empty token (which is different from a nil token).
   125  // One could achieve the same behavior with a custom error value but
   126  // providing one here is tidier.
   127  // See the emptyFinalToken example for a use of this value.
   128  var ErrFinalToken = errors.New("final token")
   129  
   130  // Scan advances the [Scanner] to the next token, which will then be
   131  // available through the [Scanner.Bytes] or [Scanner.Text] method. It returns false when
   132  // there are no more tokens, either by reaching the end of the input or an error.
   133  // After Scan returns false, the [Scanner.Err] method will return any error that
   134  // occurred during scanning, except that if it was [io.EOF], [Scanner.Err]
   135  // will return nil.
   136  // Scan panics if the split function returns too many empty
   137  // tokens without advancing the input. This is a common error mode for
   138  // scanners.
   139  func (s *Scanner) Scan() bool {
   140  	if s.done {
   141  		return false
   142  	}
   143  	s.scanCalled = true
   144  	// Loop until we have a token.
   145  	for {
   146  		// See if we can get a token with what we already have.
   147  		// If we've run out of data but have an error, give the split function
   148  		// a chance to recover any remaining, possibly empty token.
   149  		if s.end > s.start || s.err != nil {
   150  			advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil)
   151  			if err != nil {
   152  				if err == ErrFinalToken {
   153  					s.token = token
   154  					s.done = true
   155  					// When token is not nil, it means the scanning stops
   156  					// with a trailing token, and thus the return value
   157  					// should be true to indicate the existence of the token.
   158  					return token != nil
   159  				}
   160  				s.setErr(err)
   161  				return false
   162  			}
   163  			if !s.advance(advance) {
   164  				return false
   165  			}
   166  			s.token = token
   167  			if token != nil {
   168  				if s.err == nil || advance > 0 {
   169  					s.empties = 0
   170  				} else {
   171  					// Returning tokens not advancing input at EOF.
   172  					s.empties++
   173  					if s.empties > maxConsecutiveEmptyReads {
   174  						panic("bufio.Scan: too many empty tokens without progressing")
   175  					}
   176  				}
   177  				return true
   178  			}
   179  		}
   180  		// We cannot generate a token with what we are holding.
   181  		// If we've already hit EOF or an I/O error, we are done.
   182  		if s.err != nil {
   183  			// Shut it down.
   184  			s.start = 0
   185  			s.end = 0
   186  			return false
   187  		}
   188  		// Must read more data.
   189  		// First, shift data to beginning of buffer if there's lots of empty space
   190  		// or space is needed.
   191  		if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
   192  			copy(s.buf, s.buf[s.start:s.end])
   193  			s.end -= s.start
   194  			s.start = 0
   195  		}
   196  		// Is the buffer full? If so, resize.
   197  		if s.end == len(s.buf) {
   198  			// Guarantee no overflow in the multiplication below.
   199  			const maxInt = int(^uint(0) >> 1)
   200  			if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 {
   201  				s.setErr(ErrTooLong)
   202  				return false
   203  			}
   204  			newSize := len(s.buf) * 2
   205  			if newSize == 0 {
   206  				newSize = startBufSize
   207  			}
   208  			newSize = min(newSize, s.maxTokenSize)
   209  			newBuf := make([]byte, newSize)
   210  			copy(newBuf, s.buf[s.start:s.end])
   211  			s.buf = newBuf
   212  			s.end -= s.start
   213  			s.start = 0
   214  		}
   215  		// Finally we can read some input. Make sure we don't get stuck with
   216  		// a misbehaving Reader. Officially we don't need to do this, but let's
   217  		// be extra careful: Scanner is for safe, simple jobs.
   218  		for loop := 0; ; {
   219  			n, err := s.r.Read(s.buf[s.end:len(s.buf)])
   220  			if n < 0 || len(s.buf)-s.end < n {
   221  				s.setErr(ErrBadReadCount)
   222  				break
   223  			}
   224  			s.end += n
   225  			if err != nil {
   226  				s.setErr(err)
   227  				break
   228  			}
   229  			if n > 0 {
   230  				s.empties = 0
   231  				break
   232  			}
   233  			loop++
   234  			if loop > maxConsecutiveEmptyReads {
   235  				s.setErr(io.ErrNoProgress)
   236  				break
   237  			}
   238  		}
   239  	}
   240  }
   241  
   242  // advance consumes n bytes of the buffer. It reports whether the advance was legal.
   243  func (s *Scanner) advance(n int) bool {
   244  	if n < 0 {
   245  		s.setErr(ErrNegativeAdvance)
   246  		return false
   247  	}
   248  	if n > s.end-s.start {
   249  		s.setErr(ErrAdvanceTooFar)
   250  		return false
   251  	}
   252  	s.start += n
   253  	return true
   254  }
   255  
   256  // setErr records the first error encountered.
   257  func (s *Scanner) setErr(err error) {
   258  	if s.err == nil || s.err == io.EOF {
   259  		s.err = err
   260  	}
   261  }
   262  
   263  // Buffer sets the initial buffer to use when scanning
   264  // and the maximum size of buffer that may be allocated during scanning.
   265  // The maximum token size must be less than the larger of max and cap(buf).
   266  // If max <= cap(buf), [Scanner.Scan] will use this buffer only and do no allocation.
   267  //
   268  // By default, [Scanner.Scan] uses an internal buffer and sets the
   269  // maximum token size to [MaxScanTokenSize].
   270  //
   271  // Buffer panics if it is called after scanning has started.
   272  func (s *Scanner) Buffer(buf []byte, max int) {
   273  	if s.scanCalled {
   274  		panic("Buffer called after Scan")
   275  	}
   276  	s.buf = buf[0:cap(buf)]
   277  	s.maxTokenSize = max
   278  }
   279  
   280  // Split sets the split function for the [Scanner].
   281  // The default split function is [ScanLines].
   282  //
   283  // Split panics if it is called after scanning has started.
   284  func (s *Scanner) Split(split SplitFunc) {
   285  	if s.scanCalled {
   286  		panic("Split called after Scan")
   287  	}
   288  	s.split = split
   289  }
   290  
   291  // Split functions
   292  
   293  // ScanBytes is a split function for a [Scanner] that returns each byte as a token.
   294  func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
   295  	if atEOF && len(data) == 0 {
   296  		return 0, nil, nil
   297  	}
   298  	return 1, data[0:1], nil
   299  }
   300  
   301  var errorRune = []byte(string(utf8.RuneError))
   302  
   303  // ScanRunes is a split function for a [Scanner] that returns each
   304  // UTF-8-encoded rune as a token. The sequence of runes returned is
   305  // equivalent to that from a range loop over the input as a string, which
   306  // means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
   307  // Because of the Scan interface, this makes it impossible for the client to
   308  // distinguish correctly encoded replacement runes from encoding errors.
   309  func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
   310  	if atEOF && len(data) == 0 {
   311  		return 0, nil, nil
   312  	}
   313  
   314  	// Fast path 1: ASCII.
   315  	if data[0] < utf8.RuneSelf {
   316  		return 1, data[0:1], nil
   317  	}
   318  
   319  	// Fast path 2: Correct UTF-8 decode without error.
   320  	_, width := utf8.DecodeRune(data)
   321  	if width > 1 {
   322  		// It's a valid encoding. Width cannot be one for a correctly encoded
   323  		// non-ASCII rune.
   324  		return width, data[0:width], nil
   325  	}
   326  
   327  	// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
   328  	// Is the error because there wasn't a full rune to be decoded?
   329  	// FullRune distinguishes correctly between erroneous and incomplete encodings.
   330  	if !atEOF && !utf8.FullRune(data) {
   331  		// Incomplete; get more bytes.
   332  		return 0, nil, nil
   333  	}
   334  
   335  	// We have a real UTF-8 encoding error. Return a properly encoded error rune
   336  	// but advance only one byte. This matches the behavior of a range loop over
   337  	// an incorrectly encoded string.
   338  	return 1, errorRune, nil
   339  }
   340  
   341  // dropCR drops a terminal \r from the data.
   342  func dropCR(data []byte) []byte {
   343  	if len(data) > 0 && data[len(data)-1] == '\r' {
   344  		return data[0 : len(data)-1]
   345  	}
   346  	return data
   347  }
   348  
   349  // ScanLines is a split function for a [Scanner] that returns each line of
   350  // text, stripped of any trailing end-of-line marker. The returned line may
   351  // be empty. The end-of-line marker is one optional carriage return followed
   352  // by one mandatory newline. In regular expression notation, it is `\r?\n`.
   353  // The last non-empty line of input will be returned even if it has no
   354  // newline.
   355  func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
   356  	if atEOF && len(data) == 0 {
   357  		return 0, nil, nil
   358  	}
   359  	if i := bytes.IndexByte(data, '\n'); i >= 0 {
   360  		// We have a full newline-terminated line.
   361  		return i + 1, dropCR(data[0:i]), nil
   362  	}
   363  	// If we're at EOF, we have a final, non-terminated line. Return it.
   364  	if atEOF {
   365  		return len(data), dropCR(data), nil
   366  	}
   367  	// Request more data.
   368  	return 0, nil, nil
   369  }
   370  
   371  // isSpace reports whether the character is a Unicode white space character.
   372  // We avoid dependency on the unicode package, but check validity of the implementation
   373  // in the tests.
   374  func isSpace(r rune) bool {
   375  	if r <= '\u00FF' {
   376  		// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.
   377  		switch r {
   378  		case ' ', '\t', '\n', '\v', '\f', '\r':
   379  			return true
   380  		case '\u0085', '\u00A0':
   381  			return true
   382  		}
   383  		return false
   384  	}
   385  	// High-valued ones.
   386  	if '\u2000' <= r && r <= '\u200a' {
   387  		return true
   388  	}
   389  	switch r {
   390  	case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
   391  		return true
   392  	}
   393  	return false
   394  }
   395  
   396  // ScanWords is a split function for a [Scanner] that returns each
   397  // space-separated word of text, with surrounding spaces deleted. It will
   398  // never return an empty string. The definition of space is set by
   399  // unicode.IsSpace.
   400  func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
   401  	// Skip leading spaces.
   402  	start := 0
   403  	for width := 0; start < len(data); start += width {
   404  		var r rune
   405  		r, width = utf8.DecodeRune(data[start:])
   406  		if !isSpace(r) {
   407  			break
   408  		}
   409  	}
   410  	// Scan until space, marking end of word.
   411  	for width, i := 0, start; i < len(data); i += width {
   412  		var r rune
   413  		r, width = utf8.DecodeRune(data[i:])
   414  		if isSpace(r) {
   415  			return i + width, data[start:i], nil
   416  		}
   417  	}
   418  	// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
   419  	if atEOF && len(data) > start {
   420  		return len(data), data[start:], nil
   421  	}
   422  	// Request more data.
   423  	return start, nil, nil
   424  }
   425
View as plain text