...

Source file src/go/scanner/scanner.go

Documentation: go/scanner

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package scanner implements a scanner for Go source text.
     6  // It takes a []byte as source which can then be tokenized
     7  // through repeated calls to the Scan method.
     8  package scanner
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"go/token"
    14  	"path/filepath"
    15  	"strconv"
    16  	"unicode"
    17  	"unicode/utf8"
    18  )
    19  
// An ErrorHandler may be provided to Scanner.Init. If a syntax error is
// encountered and a handler was installed, the handler is called with a
// position and an error message. The position points to the beginning of
// the offending token.
//
// Scanner.ErrorCount is incremented for every reported error whether or
// not a handler is installed.
type ErrorHandler func(pos token.Position, msg string)
    25  
// A Scanner holds the scanner's internal state while processing
// a given text. It can be allocated as part of another data
// structure but must be initialized via Init before use.
type Scanner struct {
	// immutable state
	file *token.File  // source file handle
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode Mode         // scanning mode

	// scanning state
	ch         rune // current character; < 0 means end-of-file
	offset     int  // character offset (byte offset of ch in src)
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // current line offset (offset of the first character of the current line)
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    47  
// Special character values used by the scanner.
const (
	bom = 0xFEFF // byte order mark, only permitted as very first character
	eof = -1     // end of file; stored in Scanner.ch once the source is exhausted
)
    52  
    53  // Read the next Unicode char into s.ch.
    54  // s.ch < 0 means end-of-file.
    55  //
    56  // For optimization, there is some overlap between this method and
    57  // s.scanIdentifier.
    58  func (s *Scanner) next() {
    59  	if s.rdOffset < len(s.src) {
    60  		s.offset = s.rdOffset
    61  		if s.ch == '\n' {
    62  			s.lineOffset = s.offset
    63  			s.file.AddLine(s.offset)
    64  		}
    65  		r, w := rune(s.src[s.rdOffset]), 1
    66  		switch {
    67  		case r == 0:
    68  			s.error(s.offset, "illegal character NUL")
    69  		case r >= utf8.RuneSelf:
    70  			// not ASCII
    71  			r, w = utf8.DecodeRune(s.src[s.rdOffset:])
    72  			if r == utf8.RuneError && w == 1 {
    73  				s.error(s.offset, "illegal UTF-8 encoding")
    74  			} else if r == bom && s.offset > 0 {
    75  				s.error(s.offset, "illegal byte order mark")
    76  			}
    77  		}
    78  		s.rdOffset += w
    79  		s.ch = r
    80  	} else {
    81  		s.offset = len(s.src)
    82  		if s.ch == '\n' {
    83  			s.lineOffset = s.offset
    84  			s.file.AddLine(s.offset)
    85  		}
    86  		s.ch = eof
    87  	}
    88  }
    89  
    90  // peek returns the byte following the most recently read character without
    91  // advancing the scanner. If the scanner is at EOF, peek returns 0.
    92  func (s *Scanner) peek() byte {
    93  	if s.rdOffset < len(s.src) {
    94  		return s.src[s.rdOffset]
    95  	}
    96  	return 0
    97  }
    98  
// A Mode value is a set of flags (or 0).
// They control scanner behavior.
type Mode uint
   102  
// Mode flags. They may be combined with the | operator.
const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens
	dontInsertSemis                  // do not automatically insert semicolons - for testing only
)
   107  
   108  // Init prepares the scanner s to tokenize the text src by setting the
   109  // scanner at the beginning of src. The scanner uses the file set file
   110  // for position information and it adds line information for each line.
   111  // It is ok to re-use the same file when re-scanning the same file as
   112  // line information which is already present is ignored. Init causes a
   113  // panic if the file size does not match the src size.
   114  //
   115  // Calls to Scan will invoke the error handler err if they encounter a
   116  // syntax error and err is not nil. Also, for each error encountered,
   117  // the Scanner field ErrorCount is incremented by one. The mode parameter
   118  // determines how comments are handled.
   119  //
   120  // Note that Init may call err if there is an error in the first character
   121  // of the file.
   122  func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
   123  	// Explicitly initialize all fields since a scanner may be reused.
   124  	if file.Size() != len(src) {
   125  		panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
   126  	}
   127  	s.file = file
   128  	s.dir, _ = filepath.Split(file.Name())
   129  	s.src = src
   130  	s.err = err
   131  	s.mode = mode
   132  
   133  	s.ch = ' '
   134  	s.offset = 0
   135  	s.rdOffset = 0
   136  	s.lineOffset = 0
   137  	s.insertSemi = false
   138  	s.ErrorCount = 0
   139  
   140  	s.next()
   141  	if s.ch == bom {
   142  		s.next() // ignore BOM at file beginning
   143  	}
   144  }
   145  
   146  func (s *Scanner) error(offs int, msg string) {
   147  	if s.err != nil {
   148  		s.err(s.file.Position(s.file.Pos(offs)), msg)
   149  	}
   150  	s.ErrorCount++
   151  }
   152  
   153  func (s *Scanner) errorf(offs int, format string, args ...any) {
   154  	s.error(offs, fmt.Sprintf(format, args...))
   155  }
   156  
   157  func (s *Scanner) scanComment() string {
   158  	// initial '/' already consumed; s.ch == '/' || s.ch == '*'
   159  	offs := s.offset - 1 // position of initial '/'
   160  	next := -1           // position immediately following the comment; < 0 means invalid comment
   161  	numCR := 0
   162  
   163  	if s.ch == '/' {
   164  		//-style comment
   165  		// (the final '\n' is not considered part of the comment)
   166  		s.next()
   167  		for s.ch != '\n' && s.ch >= 0 {
   168  			if s.ch == '\r' {
   169  				numCR++
   170  			}
   171  			s.next()
   172  		}
   173  		// if we are at '\n', the position following the comment is afterwards
   174  		next = s.offset
   175  		if s.ch == '\n' {
   176  			next++
   177  		}
   178  		goto exit
   179  	}
   180  
   181  	/*-style comment */
   182  	s.next()
   183  	for s.ch >= 0 {
   184  		ch := s.ch
   185  		if ch == '\r' {
   186  			numCR++
   187  		}
   188  		s.next()
   189  		if ch == '*' && s.ch == '/' {
   190  			s.next()
   191  			next = s.offset
   192  			goto exit
   193  		}
   194  	}
   195  
   196  	s.error(offs, "comment not terminated")
   197  
   198  exit:
   199  	lit := s.src[offs:s.offset]
   200  
   201  	// On Windows, a (//-comment) line may end in "\r\n".
   202  	// Remove the final '\r' before analyzing the text for
   203  	// line directives (matching the compiler). Remove any
   204  	// other '\r' afterwards (matching the pre-existing be-
   205  	// havior of the scanner).
   206  	if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
   207  		lit = lit[:len(lit)-1]
   208  		numCR--
   209  	}
   210  
   211  	// interpret line directives
   212  	// (//line directives must start at the beginning of the current line)
   213  	if next >= 0 /* implies valid comment */ && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
   214  		s.updateLineInfo(next, offs, lit)
   215  	}
   216  
   217  	if numCR > 0 {
   218  		lit = stripCR(lit, lit[1] == '*')
   219  	}
   220  
   221  	return string(lit)
   222  }
   223  
// prefix is the text that must directly follow the "//" or "/*" of a
// comment for scanComment to treat the comment as a line directive.
var prefix = []byte("line ")
   225  
// updateLineInfo parses the incoming comment text at offset offs
// as a line directive. If successful, it updates the line info table
// for the position next per the line directive.
//
// The accepted forms are "filename:line" and "filename:line:col",
// following the "//line " or "/*line " prefix. Text without a ':' is
// silently ignored; a malformed or zero line or column number is
// reported as an error and leaves the line info table unchanged.
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7       // keep offs in sync with the start of text for error positions

	// i is the index just past the last ':' in text (0 if there is none),
	// n the number following it, if valid (ok).
	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // ignore (not a line directive)
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	var line, col int
	// Look for a second trailing number in the text before the last ':'.
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		// form: filename:line:col
		// After the swap, i indexes the line number and i2 the column number.
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		// form: filename:line
		line = n
	}

	if line == 0 {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// If we have a column (//line filename:line:col form),
	// an empty filename means to use the previous filename.
	filename := string(text[:i-1]) // lop off ":line" (the filename is taken verbatim)
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the current directory.
		// This is for compatibility with earlier releases.
		// See issue 26671.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}
   287  
   288  func trailingDigits(text []byte) (int, int, bool) {
   289  	i := bytes.LastIndexByte(text, ':') // look from right (Windows filenames may contain ':')
   290  	if i < 0 {
   291  		return 0, 0, false // no ":"
   292  	}
   293  	// i >= 0
   294  	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
   295  	return i + 1, int(n), err == nil
   296  }
   297  
// findLineEnd looks ahead from a comment whose initial '/' has already
// been consumed and reports whether a newline or EOF is reached before
// any non-comment token. This is the case if the comment is a //-style
// comment, if a /*-style comment contains a newline, or if the
// comment(s) and following white space extend to a newline or EOF.
//
// Before returning, s.ch, s.offset and s.rdOffset are reset to their
// values at the time of the call.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	// The argument s.offset - 1 (the position of the initial '/') is
	// evaluated now, before any look-ahead moves the scanner.
	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			// a //-style comment always contains a newline
			return true
		}
		// /*-style comment: look for newline
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		s.skipWhitespace() // s.insertSemi is set, so this stops at a newline
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/'
	}

	// the '/' was not the start of another comment
	return false
}
   341  
   342  func isLetter(ch rune) bool {
   343  	return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
   344  }
   345  
   346  func isDigit(ch rune) bool {
   347  	return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
   348  }
   349  
// scanIdentifier reads the string of valid identifier characters at s.offset.
// It must only be called when s.ch is known to be a valid letter.
// It returns the identifier text and leaves s.ch at the first character
// following the identifier (eof if the identifier extends to the end of
// the source).
//
// Be careful when making changes to this function: it is optimized and affects
// scanning performance significantly.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Optimize for the common case of an ASCII identifier.
	//
	// Ranging over s.src[s.rdOffset:] lets us avoid some bounds checks, and
	// avoids conversions to runes.
	//
	// In case we encounter a non-ASCII character, fall back on the slower path
	// of calling into s.next().
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// Avoid assigning a rune for the common case of an ascii character.
			continue
		}
		// b is the first byte that is not an ASCII identifier character;
		// move the reading offset to it.
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// Optimization: we've encountered an ASCII character that's not a letter
			// or number. Avoid the call into s.next() and corresponding set up.
			//
			// Note that s.next() does some line accounting if s.ch is '\n', so this
			// shortcut is only possible because we know that the preceding character
			// is not '\n'.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is NUL or the start of a non-ASCII character.
		//
		// We know that the preceding character is valid for an identifier because
		// scanIdentifier is only called when s.ch is a letter, so calling s.next()
		// at s.rdOffset resets the scanner state.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier extends to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
   399  
   400  func digitVal(ch rune) int {
   401  	switch {
   402  	case '0' <= ch && ch <= '9':
   403  		return int(ch - '0')
   404  	case 'a' <= lower(ch) && lower(ch) <= 'f':
   405  		return int(lower(ch) - 'a' + 10)
   406  	}
   407  	return 16 // larger than any legal digit val
   408  }
   409  
   410  func lower(ch rune) rune     { return ('a' - 'A') | ch } // returns lower-case ch iff ch is ASCII letter
   411  func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
   412  func isHex(ch rune) bool     { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
   413  
   414  // digits accepts the sequence { digit | '_' }.
   415  // If base <= 10, digits accepts any decimal digit but records
   416  // the offset (relative to the source start) of a digit >= base
   417  // in *invalid, if *invalid < 0.
   418  // digits returns a bitset describing whether the sequence contained
   419  // digits (bit 0 is set), or separators '_' (bit 1 is set).
   420  func (s *Scanner) digits(base int, invalid *int) (digsep int) {
   421  	if base <= 10 {
   422  		max := rune('0' + base)
   423  		for isDecimal(s.ch) || s.ch == '_' {
   424  			ds := 1
   425  			if s.ch == '_' {
   426  				ds = 2
   427  			} else if s.ch >= max && *invalid < 0 {
   428  				*invalid = s.offset // record invalid rune offset
   429  			}
   430  			digsep |= ds
   431  			s.next()
   432  		}
   433  	} else {
   434  		for isHex(s.ch) || s.ch == '_' {
   435  			ds := 1
   436  			if s.ch == '_' {
   437  				ds = 2
   438  			}
   439  			digsep |= ds
   440  			s.next()
   441  		}
   442  	}
   443  	return
   444  }
   445  
// scanNumber scans a number literal starting at s.ch and returns its
// token kind (token.INT, token.FLOAT, or token.IMAG) together with the
// literal text. Malformed literals are reported through s.error, but
// scanning continues and the text consumed so far is still returned.
func (s *Scanner) scanNumber() (token.Token, string) {
	offs := s.offset
	tok := token.ILLEGAL

	base := 10        // number base
	prefix := rune(0) // one of 0 (decimal), '0' (0-octal), 'x', 'o', or 'b'
	digsep := 0       // bit 0: digit present, bit 1: '_' present
	invalid := -1     // index of invalid digit in literal, or < 0

	// integer part
	if s.ch != '.' {
		tok = token.INT
		if s.ch == '0' {
			s.next()
			switch lower(s.ch) {
			case 'x':
				s.next()
				base, prefix = 16, 'x'
			case 'o':
				s.next()
				base, prefix = 8, 'o'
			case 'b':
				s.next()
				base, prefix = 2, 'b'
			default:
				base, prefix = 8, '0'
				digsep = 1 // leading 0
			}
		}
		digsep |= s.digits(base, &invalid)
	}

	// fractional part
	if s.ch == '.' {
		tok = token.FLOAT
		if prefix == 'o' || prefix == 'b' {
			s.error(s.offset, "invalid radix point in "+litname(prefix))
		}
		s.next()
		digsep |= s.digits(base, &invalid)
	}

	if digsep&1 == 0 {
		s.error(s.offset, litname(prefix)+" has no digits")
	}

	// exponent
	if e := lower(s.ch); e == 'e' || e == 'p' {
		switch {
		case e == 'e' && prefix != 0 && prefix != '0':
			s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
		case e == 'p' && prefix != 'x':
			s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
		}
		s.next()
		tok = token.FLOAT
		if s.ch == '+' || s.ch == '-' {
			s.next()
		}
		// Passing nil is safe: with base 10, digits never dereferences invalid.
		ds := s.digits(10, nil)
		digsep |= ds
		if ds&1 == 0 {
			s.error(s.offset, "exponent has no digits")
		}
	} else if prefix == 'x' && tok == token.FLOAT {
		s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
	}

	// suffix 'i'
	if s.ch == 'i' {
		tok = token.IMAG
		s.next()
	}

	lit := string(s.src[offs:s.offset])
	// An invalid digit is only an error for integer literals.
	if tok == token.INT && invalid >= 0 {
		s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
	}
	if digsep&2 != 0 {
		if i := invalidSep(lit); i >= 0 {
			s.error(offs+i, "'_' must separate successive digits")
		}
	}

	return tok, lit
}
   532  
   533  func litname(prefix rune) string {
   534  	switch prefix {
   535  	case 'x':
   536  		return "hexadecimal literal"
   537  	case 'o', '0':
   538  		return "octal literal"
   539  	case 'b':
   540  		return "binary literal"
   541  	}
   542  	return "decimal literal"
   543  }
   544  
   545  // invalidSep returns the index of the first invalid separator in x, or -1.
   546  func invalidSep(x string) int {
   547  	x1 := ' ' // prefix char, we only care if it's 'x'
   548  	d := '.'  // digit, one of '_', '0' (a digit), or '.' (anything else)
   549  	i := 0
   550  
   551  	// a prefix counts as a digit
   552  	if len(x) >= 2 && x[0] == '0' {
   553  		x1 = lower(rune(x[1]))
   554  		if x1 == 'x' || x1 == 'o' || x1 == 'b' {
   555  			d = '0'
   556  			i = 2
   557  		}
   558  	}
   559  
   560  	// mantissa and exponent
   561  	for ; i < len(x); i++ {
   562  		p := d // previous digit
   563  		d = rune(x[i])
   564  		switch {
   565  		case d == '_':
   566  			if p != '0' {
   567  				return i
   568  			}
   569  		case isDecimal(d) || x1 == 'x' && isHex(d):
   570  			d = '0'
   571  		default:
   572  			if p == '_' {
   573  				return i - 1
   574  			}
   575  			d = '.'
   576  		}
   577  	}
   578  	if d == '_' {
   579  		return len(x) - 1
   580  	}
   581  
   582  	return -1
   583  }
   584  
   585  // scanEscape parses an escape sequence where rune is the accepted
   586  // escaped quote. In case of a syntax error, it stops at the offending
   587  // character (without consuming it) and returns false. Otherwise
   588  // it returns true.
   589  func (s *Scanner) scanEscape(quote rune) bool {
   590  	offs := s.offset
   591  
   592  	var n int
   593  	var base, max uint32
   594  	switch s.ch {
   595  	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   596  		s.next()
   597  		return true
   598  	case '0', '1', '2', '3', '4', '5', '6', '7':
   599  		n, base, max = 3, 8, 255
   600  	case 'x':
   601  		s.next()
   602  		n, base, max = 2, 16, 255
   603  	case 'u':
   604  		s.next()
   605  		n, base, max = 4, 16, unicode.MaxRune
   606  	case 'U':
   607  		s.next()
   608  		n, base, max = 8, 16, unicode.MaxRune
   609  	default:
   610  		msg := "unknown escape sequence"
   611  		if s.ch < 0 {
   612  			msg = "escape sequence not terminated"
   613  		}
   614  		s.error(offs, msg)
   615  		return false
   616  	}
   617  
   618  	var x uint32
   619  	for n > 0 {
   620  		d := uint32(digitVal(s.ch))
   621  		if d >= base {
   622  			msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
   623  			if s.ch < 0 {
   624  				msg = "escape sequence not terminated"
   625  			}
   626  			s.error(s.offset, msg)
   627  			return false
   628  		}
   629  		x = x*base + d
   630  		s.next()
   631  		n--
   632  	}
   633  
   634  	if x > max || 0xD800 <= x && x < 0xE000 {
   635  		s.error(offs, "escape sequence is invalid Unicode code point")
   636  		return false
   637  	}
   638  
   639  	return true
   640  }
   641  
   642  func (s *Scanner) scanRune() string {
   643  	// '\'' opening already consumed
   644  	offs := s.offset - 1
   645  
   646  	valid := true
   647  	n := 0
   648  	for {
   649  		ch := s.ch
   650  		if ch == '\n' || ch < 0 {
   651  			// only report error if we don't have one already
   652  			if valid {
   653  				s.error(offs, "rune literal not terminated")
   654  				valid = false
   655  			}
   656  			break
   657  		}
   658  		s.next()
   659  		if ch == '\'' {
   660  			break
   661  		}
   662  		n++
   663  		if ch == '\\' {
   664  			if !s.scanEscape('\'') {
   665  				valid = false
   666  			}
   667  			// continue to read to closing quote
   668  		}
   669  	}
   670  
   671  	if valid && n != 1 {
   672  		s.error(offs, "illegal rune literal")
   673  	}
   674  
   675  	return string(s.src[offs:s.offset])
   676  }
   677  
   678  func (s *Scanner) scanString() string {
   679  	// '"' opening already consumed
   680  	offs := s.offset - 1
   681  
   682  	for {
   683  		ch := s.ch
   684  		if ch == '\n' || ch < 0 {
   685  			s.error(offs, "string literal not terminated")
   686  			break
   687  		}
   688  		s.next()
   689  		if ch == '"' {
   690  			break
   691  		}
   692  		if ch == '\\' {
   693  			s.scanEscape('"')
   694  		}
   695  	}
   696  
   697  	return string(s.src[offs:s.offset])
   698  }
   699  
   700  func stripCR(b []byte, comment bool) []byte {
   701  	c := make([]byte, len(b))
   702  	i := 0
   703  	for j, ch := range b {
   704  		// In a /*-style comment, don't strip \r from *\r/ (incl.
   705  		// sequences of \r from *\r\r...\r/) since the resulting
   706  		// */ would terminate the comment too early unless the \r
   707  		// is immediately following the opening /* in which case
   708  		// it's ok because /*/ is not closed yet (issue #11151).
   709  		if ch != '\r' || comment && i > len("/*") && c[i-1] == '*' && j+1 < len(b) && b[j+1] == '/' {
   710  			c[i] = ch
   711  			i++
   712  		}
   713  	}
   714  	return c[:i]
   715  }
   716  
   717  func (s *Scanner) scanRawString() string {
   718  	// '`' opening already consumed
   719  	offs := s.offset - 1
   720  
   721  	hasCR := false
   722  	for {
   723  		ch := s.ch
   724  		if ch < 0 {
   725  			s.error(offs, "raw string literal not terminated")
   726  			break
   727  		}
   728  		s.next()
   729  		if ch == '`' {
   730  			break
   731  		}
   732  		if ch == '\r' {
   733  			hasCR = true
   734  		}
   735  	}
   736  
   737  	lit := s.src[offs:s.offset]
   738  	if hasCR {
   739  		lit = stripCR(lit, false)
   740  	}
   741  
   742  	return string(lit)
   743  }
   744  
   745  func (s *Scanner) skipWhitespace() {
   746  	for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
   747  		s.next()
   748  	}
   749  }
   750  
   751  // Helper functions for scanning multi-byte tokens such as >> += >>= .
   752  // Different routines recognize different length tok_i based on matches
   753  // of ch_i. If a token ends in '=', the result is tok1 or tok3
   754  // respectively. Otherwise, the result is tok0 if there was no other
   755  // matching character, or tok2 if the matching character was ch2.
   756  
   757  func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   758  	if s.ch == '=' {
   759  		s.next()
   760  		return tok1
   761  	}
   762  	return tok0
   763  }
   764  
   765  func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
   766  	if s.ch == '=' {
   767  		s.next()
   768  		return tok1
   769  	}
   770  	if s.ch == ch2 {
   771  		s.next()
   772  		return tok2
   773  	}
   774  	return tok0
   775  }
   776  
   777  func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
   778  	if s.ch == '=' {
   779  		s.next()
   780  		return tok1
   781  	}
   782  	if s.ch == ch2 {
   783  		s.next()
   784  		if s.ch == '=' {
   785  			s.next()
   786  			return tok3
   787  		}
   788  		return tok2
   789  	}
   790  	return tok0
   791  }
   792  
// Scan scans the next token and returns the token position, the token,
// and its literal string if applicable. The source end is indicated by
// token.EOF.
//
// If the returned token is a literal (token.IDENT, token.INT, token.FLOAT,
// token.IMAG, token.CHAR, token.STRING) or token.COMMENT, the literal string
// has the corresponding value.
//
// If the returned token is a keyword, the literal string is the keyword.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// If the returned token is token.ILLEGAL, the literal string is the
// offending character.
//
// In all other cases, Scan returns an empty literal string.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	// (insertSemi records whether a newline following this token
	// must be turned into a semicolon)
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// keywords are longer than one letter - avoid lookup otherwise
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			// end of file
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if s.insertSemi was
			// set in the first place and exited early
			// from s.skipWhitespace()
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// fractions starting with a '.' are handled by outer switch
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// A semicolon must be inserted before the comment;
					// the comment itself is scanned by the next call.
					// reset position to the beginning of the comment
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	// In dontInsertSemis mode s.insertSemi is left untouched.
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
   984  

View as plain text