Source file src/bufio/scan.go
1 // Copyright 2013 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package bufio 6 7 import ( 8 "bytes" 9 "errors" 10 "io" 11 "unicode/utf8" 12 ) 13 14 // Scanner provides a convenient interface for reading data such as 15 // a file of newline-delimited lines of text. Successive calls to 16 // the Scan method will step through the 'tokens' of a file, skipping 17 // the bytes between the tokens. The specification of a token is 18 // defined by a split function of type SplitFunc; the default split 19 // function breaks the input into lines with line termination stripped. Split 20 // functions are defined in this package for scanning a file into 21 // lines, bytes, UTF-8-encoded runes, and space-delimited words. The 22 // client may instead provide a custom split function. 23 // 24 // Scanning stops unrecoverably at EOF, the first I/O error, or a token too 25 // large to fit in the buffer. When a scan stops, the reader may have 26 // advanced arbitrarily far past the last token. Programs that need more 27 // control over error handling or large tokens, or must run sequential scans 28 // on a reader, should use bufio.Reader instead. 29 type Scanner struct { 30 r io.Reader // The reader provided by the client. 31 split SplitFunc // The function to split the tokens. 32 maxTokenSize int // Maximum size of a token; modified by tests. 33 token []byte // Last token returned by split. 34 buf []byte // Buffer used as argument to split. 35 start int // First non-processed byte in buf. 36 end int // End of data in buf. 37 err error // Sticky error. 38 empties int // Count of successive empty tokens. 39 scanCalled bool // Scan has been called; buffer is in use. 40 done bool // Scan has finished. 41 } 42 43 // SplitFunc is the signature of the split function used to tokenize the 44 // input. The arguments are an initial substring of the remaining unprocessed 45 // data and a flag, atEOF, that reports whether the Reader has no more data 46 // to give. The return values are the number of bytes to advance the input 47 // and the next token to return to the user, if any, plus an error, if any. 48 // 49 // Scanning stops if the function returns an error, in which case some of 50 // the input may be discarded. If that error is ErrFinalToken, scanning 51 // stops with no error. 52 // 53 // Otherwise, the Scanner advances the input. If the token is not nil, 54 // the Scanner returns it to the user. If the token is nil, the 55 // Scanner reads more data and continues scanning; if there is no more 56 // data--if atEOF was true--the Scanner returns. If the data does not 57 // yet hold a complete token, for instance if it has no newline while 58 // scanning lines, a SplitFunc can return (0, nil, nil) to signal the 59 // Scanner to read more data into the slice and try again with a 60 // longer slice starting at the same point in the input. 61 // 62 // The function is never called with an empty data slice unless atEOF 63 // is true. If atEOF is true, however, data may be non-empty and, 64 // as always, holds unprocessed text. 65 type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error) 66 67 // Errors returned by Scanner. 68 var ( 69 ErrTooLong = errors.New("bufio.Scanner: token too long") 70 ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count") 71 ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input") 72 ErrBadReadCount = errors.New("bufio.Scanner: Read returned impossible count") 73 ) 74 75 const ( 76 // MaxScanTokenSize is the maximum size used to buffer a token 77 // unless the user provides an explicit buffer with Scanner.Buffer. 78 // The actual maximum token size may be smaller as the buffer 79 // may need to include, for instance, a newline. 80 MaxScanTokenSize = 64 * 1024 81 82 startBufSize = 4096 // Size of initial allocation for buffer. 83 ) 84 85 // NewScanner returns a new Scanner to read from r. 86 // The split function defaults to ScanLines. 87 func NewScanner(r io.Reader) *Scanner { 88 return &Scanner{ 89 r: r, 90 split: ScanLines, 91 maxTokenSize: MaxScanTokenSize, 92 } 93 } 94 95 // Err returns the first non-EOF error that was encountered by the Scanner. 96 func (s *Scanner) Err() error { 97 if s.err == io.EOF { 98 return nil 99 } 100 return s.err 101 } 102 103 // Bytes returns the most recent token generated by a call to Scan. 104 // The underlying array may point to data that will be overwritten 105 // by a subsequent call to Scan. It does no allocation. 106 func (s *Scanner) Bytes() []byte { 107 return s.token 108 } 109 110 // Text returns the most recent token generated by a call to Scan 111 // as a newly allocated string holding its bytes. 112 func (s *Scanner) Text() string { 113 return string(s.token) 114 } 115 116 // ErrFinalToken is a special sentinel error value. It is intended to be 117 // returned by a Split function to indicate that the token being delivered 118 // with the error is the last token and scanning should stop after this one. 119 // After ErrFinalToken is received by Scan, scanning stops with no error. 120 // The value is useful to stop processing early or when it is necessary to 121 // deliver a final empty token. One could achieve the same behavior 122 // with a custom error value but providing one here is tidier. 123 // See the emptyFinalToken example for a use of this value. 124 var ErrFinalToken = errors.New("final token") 125 126 // Scan advances the Scanner to the next token, which will then be 127 // available through the Bytes or Text method. It returns false when the 128 // scan stops, either by reaching the end of the input or an error. 129 // After Scan returns false, the Err method will return any error that 130 // occurred during scanning, except that if it was io.EOF, Err 131 // will return nil. 132 // Scan panics if the split function returns too many empty 133 // tokens without advancing the input. This is a common error mode for 134 // scanners. 135 func (s *Scanner) Scan() bool { 136 if s.done { 137 return false 138 } 139 s.scanCalled = true 140 // Loop until we have a token. 141 for { 142 // See if we can get a token with what we already have. 143 // If we've run out of data but have an error, give the split function 144 // a chance to recover any remaining, possibly empty token. 145 if s.end > s.start || s.err != nil { 146 advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil) 147 if err != nil { 148 if err == ErrFinalToken { 149 s.token = token 150 s.done = true 151 return true 152 } 153 s.setErr(err) 154 return false 155 } 156 if !s.advance(advance) { 157 return false 158 } 159 s.token = token 160 if token != nil { 161 if s.err == nil || advance > 0 { 162 s.empties = 0 163 } else { 164 // Returning tokens not advancing input at EOF. 165 s.empties++ 166 if s.empties > maxConsecutiveEmptyReads { 167 panic("bufio.Scan: too many empty tokens without progressing") 168 } 169 } 170 return true 171 } 172 } 173 // We cannot generate a token with what we are holding. 174 // If we've already hit EOF or an I/O error, we are done. 175 if s.err != nil { 176 // Shut it down. 177 s.start = 0 178 s.end = 0 179 return false 180 } 181 // Must read more data. 182 // First, shift data to beginning of buffer if there's lots of empty space 183 // or space is needed. 184 if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) { 185 copy(s.buf, s.buf[s.start:s.end]) 186 s.end -= s.start 187 s.start = 0 188 } 189 // Is the buffer full? If so, resize. 190 if s.end == len(s.buf) { 191 // Guarantee no overflow in the multiplication below. 192 const maxInt = int(^uint(0) >> 1) 193 if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 { 194 s.setErr(ErrTooLong) 195 return false 196 } 197 newSize := len(s.buf) * 2 198 if newSize == 0 { 199 newSize = startBufSize 200 } 201 if newSize > s.maxTokenSize { 202 newSize = s.maxTokenSize 203 } 204 newBuf := make([]byte, newSize) 205 copy(newBuf, s.buf[s.start:s.end]) 206 s.buf = newBuf 207 s.end -= s.start 208 s.start = 0 209 } 210 // Finally we can read some input. Make sure we don't get stuck with 211 // a misbehaving Reader. Officially we don't need to do this, but let's 212 // be extra careful: Scanner is for safe, simple jobs. 213 for loop := 0; ; { 214 n, err := s.r.Read(s.buf[s.end:len(s.buf)]) 215 if n < 0 || len(s.buf)-s.end < n { 216 s.setErr(ErrBadReadCount) 217 break 218 } 219 s.end += n 220 if err != nil { 221 s.setErr(err) 222 break 223 } 224 if n > 0 { 225 s.empties = 0 226 break 227 } 228 loop++ 229 if loop > maxConsecutiveEmptyReads { 230 s.setErr(io.ErrNoProgress) 231 break 232 } 233 } 234 } 235 } 236 237 // advance consumes n bytes of the buffer. It reports whether the advance was legal. 238 func (s *Scanner) advance(n int) bool { 239 if n < 0 { 240 s.setErr(ErrNegativeAdvance) 241 return false 242 } 243 if n > s.end-s.start { 244 s.setErr(ErrAdvanceTooFar) 245 return false 246 } 247 s.start += n 248 return true 249 } 250 251 // setErr records the first error encountered. 252 func (s *Scanner) setErr(err error) { 253 if s.err == nil || s.err == io.EOF { 254 s.err = err 255 } 256 } 257 258 // Buffer sets the initial buffer to use when scanning and the maximum 259 // size of buffer that may be allocated during scanning. The maximum 260 // token size is the larger of max and cap(buf). If max <= cap(buf), 261 // Scan will use this buffer only and do no allocation. 262 // 263 // By default, Scan uses an internal buffer and sets the 264 // maximum token size to MaxScanTokenSize. 265 // 266 // Buffer panics if it is called after scanning has started. 267 func (s *Scanner) Buffer(buf []byte, max int) { 268 if s.scanCalled { 269 panic("Buffer called after Scan") 270 } 271 s.buf = buf[0:cap(buf)] 272 s.maxTokenSize = max 273 } 274 275 // Split sets the split function for the Scanner. 276 // The default split function is ScanLines. 277 // 278 // Split panics if it is called after scanning has started. 279 func (s *Scanner) Split(split SplitFunc) { 280 if s.scanCalled { 281 panic("Split called after Scan") 282 } 283 s.split = split 284 } 285 286 // Split functions 287 288 // ScanBytes is a split function for a Scanner that returns each byte as a token. 289 func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) { 290 if atEOF && len(data) == 0 { 291 return 0, nil, nil 292 } 293 return 1, data[0:1], nil 294 } 295 296 var errorRune = []byte(string(utf8.RuneError)) 297 298 // ScanRunes is a split function for a Scanner that returns each 299 // UTF-8-encoded rune as a token. The sequence of runes returned is 300 // equivalent to that from a range loop over the input as a string, which 301 // means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd". 302 // Because of the Scan interface, this makes it impossible for the client to 303 // distinguish correctly encoded replacement runes from encoding errors. 304 func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) { 305 if atEOF && len(data) == 0 { 306 return 0, nil, nil 307 } 308 309 // Fast path 1: ASCII. 310 if data[0] < utf8.RuneSelf { 311 return 1, data[0:1], nil 312 } 313 314 // Fast path 2: Correct UTF-8 decode without error. 315 _, width := utf8.DecodeRune(data) 316 if width > 1 { 317 // It's a valid encoding. Width cannot be one for a correctly encoded 318 // non-ASCII rune. 319 return width, data[0:width], nil 320 } 321 322 // We know it's an error: we have width==1 and implicitly r==utf8.RuneError. 323 // Is the error because there wasn't a full rune to be decoded? 324 // FullRune distinguishes correctly between erroneous and incomplete encodings. 325 if !atEOF && !utf8.FullRune(data) { 326 // Incomplete; get more bytes. 327 return 0, nil, nil 328 } 329 330 // We have a real UTF-8 encoding error. Return a properly encoded error rune 331 // but advance only one byte. This matches the behavior of a range loop over 332 // an incorrectly encoded string. 333 return 1, errorRune, nil 334 } 335 336 // dropCR drops a terminal \r from the data. 337 func dropCR(data []byte) []byte { 338 if len(data) > 0 && data[len(data)-1] == '\r' { 339 return data[0 : len(data)-1] 340 } 341 return data 342 } 343 344 // ScanLines is a split function for a Scanner that returns each line of 345 // text, stripped of any trailing end-of-line marker. The returned line may 346 // be empty. The end-of-line marker is one optional carriage return followed 347 // by one mandatory newline. In regular expression notation, it is `\r?\n`. 348 // The last non-empty line of input will be returned even if it has no 349 // newline. 350 func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) { 351 if atEOF && len(data) == 0 { 352 return 0, nil, nil 353 } 354 if i := bytes.IndexByte(data, '\n'); i >= 0 { 355 // We have a full newline-terminated line. 356 return i + 1, dropCR(data[0:i]), nil 357 } 358 // If we're at EOF, we have a final, non-terminated line. Return it. 359 if atEOF { 360 return len(data), dropCR(data), nil 361 } 362 // Request more data. 363 return 0, nil, nil 364 } 365 366 // isSpace reports whether the character is a Unicode white space character. 367 // We avoid dependency on the unicode package, but check validity of the implementation 368 // in the tests. 369 func isSpace(r rune) bool { 370 if r <= '\u00FF' { 371 // Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs. 372 switch r { 373 case ' ', '\t', '\n', '\v', '\f', '\r': 374 return true 375 case '\u0085', '\u00A0': 376 return true 377 } 378 return false 379 } 380 // High-valued ones. 381 if '\u2000' <= r && r <= '\u200a' { 382 return true 383 } 384 switch r { 385 case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000': 386 return true 387 } 388 return false 389 } 390 391 // ScanWords is a split function for a Scanner that returns each 392 // space-separated word of text, with surrounding spaces deleted. It will 393 // never return an empty string. The definition of space is set by 394 // unicode.IsSpace. 395 func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) { 396 // Skip leading spaces. 397 start := 0 398 for width := 0; start < len(data); start += width { 399 var r rune 400 r, width = utf8.DecodeRune(data[start:]) 401 if !isSpace(r) { 402 break 403 } 404 } 405 // Scan until space, marking end of word. 406 for width, i := 0, start; i < len(data); i += width { 407 var r rune 408 r, width = utf8.DecodeRune(data[i:]) 409 if isSpace(r) { 410 return i + width, data[start:i], nil 411 } 412 } 413 // If we're at EOF, we have a final, non-empty, non-terminated word. Return it. 414 if atEOF && len(data) > start { 415 return len(data), data[start:], nil 416 } 417 // Request more data. 418 return start, nil, nil 419 } 420