Source file
src/go/scanner/scanner.go
1
2
3
4
5
6
7
8 package scanner
9
10 import (
11 "bytes"
12 "fmt"
13 "go/token"
14 "path/filepath"
15 "strconv"
16 "unicode"
17 "unicode/utf8"
18 )
19
20
21
22
23
// An ErrorHandler is a callback that receives the position of an error and
// a message describing it. A handler may be passed to Scanner.Init; when it
// is non-nil, the scanner's error method invokes it once per reported error.
type ErrorHandler func(pos token.Position, msg string)
25
26
27
28
// A Scanner holds the state used while tokenizing a source buffer.
// It must be set up with Init before Scan is called.
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source being scanned
	err  ErrorHandler // error reporting callback; may be nil
	mode Mode         // scanning mode flags

	// scanning state
	ch         rune // current character; eof (-1) at end of source
	offset     int  // byte offset of ch
	rdOffset   int  // byte offset of the first byte not yet read (just past ch)
	lineOffset int  // byte offset at which the current line starts
	insertSemi bool // whether to emit a semicolon before the next newline

	// public state
	ErrorCount int // number of errors reported so far (incremented by error)
}
47
const (
	bom = 0xFEFF // byte order mark; skipped by Init when it is the first character, an error elsewhere
	eof = -1     // value stored in Scanner.ch once the end of src has been reached
)
52
53
54
55
56
57
58 func (s *Scanner) next() {
59 if s.rdOffset < len(s.src) {
60 s.offset = s.rdOffset
61 if s.ch == '\n' {
62 s.lineOffset = s.offset
63 s.file.AddLine(s.offset)
64 }
65 r, w := rune(s.src[s.rdOffset]), 1
66 switch {
67 case r == 0:
68 s.error(s.offset, "illegal character NUL")
69 case r >= utf8.RuneSelf:
70
71 r, w = utf8.DecodeRune(s.src[s.rdOffset:])
72 if r == utf8.RuneError && w == 1 {
73 s.error(s.offset, "illegal UTF-8 encoding")
74 } else if r == bom && s.offset > 0 {
75 s.error(s.offset, "illegal byte order mark")
76 }
77 }
78 s.rdOffset += w
79 s.ch = r
80 } else {
81 s.offset = len(s.src)
82 if s.ch == '\n' {
83 s.lineOffset = s.offset
84 s.file.AddLine(s.offset)
85 }
86 s.ch = eof
87 }
88 }
89
90
91
92 func (s *Scanner) peek() byte {
93 if s.rdOffset < len(s.src) {
94 return s.src[s.rdOffset]
95 }
96 return 0
97 }
98
99
100
// A Mode value is a set of flags (or 0) that controls scanner behavior.
// It is passed to Scanner.Init.
type Mode uint

const (
	ScanComments    Mode = 1 << iota // return comments as COMMENT tokens instead of skipping them
	dontInsertSemis                  // do not update the automatic semicolon insertion state after a token
)
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122 func (s *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode Mode) {
123
124 if file.Size() != len(src) {
125 panic(fmt.Sprintf("file size (%d) does not match src len (%d)", file.Size(), len(src)))
126 }
127 s.file = file
128 s.dir, _ = filepath.Split(file.Name())
129 s.src = src
130 s.err = err
131 s.mode = mode
132
133 s.ch = ' '
134 s.offset = 0
135 s.rdOffset = 0
136 s.lineOffset = 0
137 s.insertSemi = false
138 s.ErrorCount = 0
139
140 s.next()
141 if s.ch == bom {
142 s.next()
143 }
144 }
145
146 func (s *Scanner) error(offs int, msg string) {
147 if s.err != nil {
148 s.err(s.file.Position(s.file.Pos(offs)), msg)
149 }
150 s.ErrorCount++
151 }
152
153 func (s *Scanner) errorf(offs int, format string, args ...any) {
154 s.error(offs, fmt.Sprintf(format, args...))
155 }
156
157 func (s *Scanner) scanComment() string {
158
159 offs := s.offset - 1
160 next := -1
161 numCR := 0
162
163 if s.ch == '/' {
164
165
166 s.next()
167 for s.ch != '\n' && s.ch >= 0 {
168 if s.ch == '\r' {
169 numCR++
170 }
171 s.next()
172 }
173
174 next = s.offset
175 if s.ch == '\n' {
176 next++
177 }
178 goto exit
179 }
180
181
182 s.next()
183 for s.ch >= 0 {
184 ch := s.ch
185 if ch == '\r' {
186 numCR++
187 }
188 s.next()
189 if ch == '*' && s.ch == '/' {
190 s.next()
191 next = s.offset
192 goto exit
193 }
194 }
195
196 s.error(offs, "comment not terminated")
197
198 exit:
199 lit := s.src[offs:s.offset]
200
201
202
203
204
205
206 if numCR > 0 && len(lit) >= 2 && lit[1] == '/' && lit[len(lit)-1] == '\r' {
207 lit = lit[:len(lit)-1]
208 numCR--
209 }
210
211
212
213 if next >= 0 && (lit[1] == '*' || offs == s.lineOffset) && bytes.HasPrefix(lit[2:], prefix) {
214 s.updateLineInfo(next, offs, lit)
215 }
216
217 if numCR > 0 {
218 lit = stripCR(lit, lit[1] == '*')
219 }
220
221 return string(lit)
222 }
223
// prefix is the byte sequence scanComment looks for directly after the
// two-character comment opener to recognize a line directive, which is
// then processed by updateLineInfo.
var prefix = []byte("line ")
225
226
227
228
// updateLineInfo parses the line directive held in the comment text (which
// begins with "//line " or "/*line ") and, if it is well formed, records
// the resulting filename, line, and column for the offset next by calling
// s.file.AddLineColumnInfo. offs is the offset at which text starts and is
// used only to position error messages.
//
// The accepted forms are "filename:line" and "filename:line:col". Text
// without any ':' after the prefix is ignored silently; an unparsable or
// zero line number, or a zero column number, is reported via s.error and
// no line information is recorded.
func (s *Scanner) updateLineInfo(next, offs int, text []byte) {
	// extract comment text
	if text[1] == '*' {
		text = text[:len(text)-2] // lop off trailing "*/"
	}
	text = text[7:] // lop off leading "//line " or "/*line "
	offs += 7

	i, n, ok := trailingDigits(text)
	if i == 0 {
		return // no ':' found: not a line directive, ignore
	}
	// i > 0

	if !ok {
		// text has a suffix :xxx but xxx is not a number
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	var line, col int
	i2, n2, ok2 := trailingDigits(text[:i-1])
	if ok2 {
		// form :line:col — after the swap, i indexes the line number and
		// i2 the column number
		i, i2 = i2, i
		line, col = n2, n
		if col == 0 {
			s.error(offs+i2, "invalid column number: "+string(text[i2:]))
			return
		}
		text = text[:i2-1] // lop off ":col"
	} else {
		// form :line — col stays 0
		line = n
	}

	if line == 0 {
		s.error(offs+i, "invalid line number: "+string(text[i:]))
		return
	}

	// With a column present (filename:line:col form), an empty filename
	// means: keep the filename currently reported for offs.
	filename := string(text[:i-1]) // lop off ":line", and trim white space
	if filename == "" && ok2 {
		filename = s.file.Position(s.file.Pos(offs)).Filename
	} else if filename != "" {
		// Put a relative filename in the source file's directory.
		// The filename is cleaned first, before it is tested for being
		// absolute and (if relative) joined with s.dir.
		filename = filepath.Clean(filename)
		if !filepath.IsAbs(filename) {
			filename = filepath.Join(s.dir, filename)
		}
	}

	s.file.AddLineColumnInfo(next, filename, line, col)
}
287
// trailingDigits looks for the last ':' in text and parses everything after
// it as an unsigned decimal number.
//
// It returns (0, 0, false) if text contains no ':'. Otherwise the first
// result is the index just past that ':' (and therefore always > 0), and
// the boolean reports whether the suffix is a valid number that fits in an
// int. The number is returned as the second result; it is 0 whenever the
// boolean is false.
func trailingDigits(text []byte) (int, int, bool) {
	// Search from the right: filenames (e.g. on Windows) may contain ':'.
	i := bytes.LastIndexByte(text, ':')
	if i < 0 {
		return 0, 0, false // no ":"
	}
	// i >= 0

	// ParseUint with bitSize 0 accepts the full uint range, which is wider
	// than int. Reject values above the largest int rather than letting the
	// conversion wrap around to a negative line or column number.
	const maxInt = int(^uint(0) >> 1)
	n, err := strconv.ParseUint(string(text[i+1:]), 10, 0)
	if err != nil || n > uint64(maxInt) {
		return i + 1, 0, false
	}
	return i + 1, int(n), true
}
297
// findLineEnd looks ahead from a comment whose initial '/' has already been
// consumed and reports whether the current line ends (with a newline or
// EOF) before any non-comment token appears. It returns true if a //-style
// comment is found, if a /*-style comment contains a newline, or if only
// comments and white space remain before a newline or EOF. It returns
// false if some other token follows on the same line.
//
// The scanner position is restored before returning, so the call has no
// lasting effect on s.ch, s.offset, or s.rdOffset.
func (s *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		s.ch = '/'
		s.offset = offs
		s.rdOffset = offs + 1
		s.next() // consume initial '/' again
	}(s.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for s.ch == '/' || s.ch == '*' {
		if s.ch == '/' {
			// a //-style comment extends to the end of the line
			return true
		}
		// /*-style comment: look for a newline inside it
		s.next()
		for s.ch >= 0 {
			ch := s.ch
			if ch == '\n' {
				return true
			}
			s.next()
			if ch == '*' && s.ch == '/' {
				s.next()
				break
			}
		}
		// Scan calls findLineEnd only while s.insertSemi is set, in which
		// case skipWhitespace stops at a newline.
		s.skipWhitespace()
		if s.ch < 0 || s.ch == '\n' {
			return true
		}
		if s.ch != '/' {
			// non-comment token
			return false
		}
		s.next() // consume '/' of a possible next comment
	}

	return false
}
341
342 func isLetter(ch rune) bool {
343 return 'a' <= lower(ch) && lower(ch) <= 'z' || ch == '_' || ch >= utf8.RuneSelf && unicode.IsLetter(ch)
344 }
345
346 func isDigit(ch rune) bool {
347 return isDecimal(ch) || ch >= utf8.RuneSelf && unicode.IsDigit(ch)
348 }
349
350
351
352
353
354
// scanIdentifier reads the identifier that starts at the current character
// and returns it as a string. On return, s.ch is the first character after
// the identifier (or eof).
//
// The current character s.ch is not examined here; the loop begins with the
// byte after it, so the caller must already have established that s.ch
// starts an identifier.
func (s *Scanner) scanIdentifier() string {
	offs := s.offset

	// Fast path for the common case of an ASCII identifier: walk the
	// remaining bytes directly instead of calling s.next for each one.
	// Note that rdOffset here is relative to s.rdOffset, not to the start
	// of s.src.
	for rdOffset, b := range s.src[s.rdOffset:] {
		if 'a' <= b && b <= 'z' || 'A' <= b && b <= 'Z' || b == '_' || '0' <= b && b <= '9' {
			// ASCII identifier character: keep going
			continue
		}
		s.rdOffset += rdOffset
		if 0 < b && b < utf8.RuneSelf {
			// A non-NUL ASCII byte that is not a letter or digit ends the
			// identifier. Set the scanner state directly rather than
			// calling s.next. This is only valid because the preceding
			// character is an identifier character and hence not '\n',
			// so the line accounting done by s.next is not needed.
			s.ch = rune(b)
			s.offset = s.rdOffset
			s.rdOffset++
			goto exit
		}
		// b is NUL or the start of a non-ASCII sequence: fall back to the
		// general path, where s.next decodes and validates characters.
		s.next()
		for isLetter(s.ch) || isDigit(s.ch) {
			s.next()
		}
		goto exit
	}
	// The identifier runs to the end of the source.
	s.offset = len(s.src)
	s.rdOffset = len(s.src)
	s.ch = eof

exit:
	return string(s.src[offs:s.offset])
}
399
400 func digitVal(ch rune) int {
401 switch {
402 case '0' <= ch && ch <= '9':
403 return int(ch - '0')
404 case 'a' <= lower(ch) && lower(ch) <= 'f':
405 return int(lower(ch) - 'a' + 10)
406 }
407 return 16
408 }
409
// lower returns ch with the ASCII case bit set. For an ASCII letter this is
// its lower-case form; the result for any other character is simply ch with
// that bit set and is only meaningful for range comparisons against
// lower-case letters.
func lower(ch rune) rune {
	const caseBit = 'a' - 'A'
	return ch | caseBit
}
// isDecimal reports whether ch is one of the ASCII digits '0' through '9'.
func isDecimal(ch rune) bool {
	if ch < '0' {
		return false
	}
	return ch <= '9'
}
412 func isHex(ch rune) bool { return '0' <= ch && ch <= '9' || 'a' <= lower(ch) && lower(ch) <= 'f' }
413
414
415
416
417
418
419
420 func (s *Scanner) digits(base int, invalid *int) (digsep int) {
421 if base <= 10 {
422 max := rune('0' + base)
423 for isDecimal(s.ch) || s.ch == '_' {
424 ds := 1
425 if s.ch == '_' {
426 ds = 2
427 } else if s.ch >= max && *invalid < 0 {
428 *invalid = s.offset
429 }
430 digsep |= ds
431 s.next()
432 }
433 } else {
434 for isHex(s.ch) || s.ch == '_' {
435 ds := 1
436 if s.ch == '_' {
437 ds = 2
438 }
439 digsep |= ds
440 s.next()
441 }
442 }
443 return
444 }
445
446 func (s *Scanner) scanNumber() (token.Token, string) {
447 offs := s.offset
448 tok := token.ILLEGAL
449
450 base := 10
451 prefix := rune(0)
452 digsep := 0
453 invalid := -1
454
455
456 if s.ch != '.' {
457 tok = token.INT
458 if s.ch == '0' {
459 s.next()
460 switch lower(s.ch) {
461 case 'x':
462 s.next()
463 base, prefix = 16, 'x'
464 case 'o':
465 s.next()
466 base, prefix = 8, 'o'
467 case 'b':
468 s.next()
469 base, prefix = 2, 'b'
470 default:
471 base, prefix = 8, '0'
472 digsep = 1
473 }
474 }
475 digsep |= s.digits(base, &invalid)
476 }
477
478
479 if s.ch == '.' {
480 tok = token.FLOAT
481 if prefix == 'o' || prefix == 'b' {
482 s.error(s.offset, "invalid radix point in "+litname(prefix))
483 }
484 s.next()
485 digsep |= s.digits(base, &invalid)
486 }
487
488 if digsep&1 == 0 {
489 s.error(s.offset, litname(prefix)+" has no digits")
490 }
491
492
493 if e := lower(s.ch); e == 'e' || e == 'p' {
494 switch {
495 case e == 'e' && prefix != 0 && prefix != '0':
496 s.errorf(s.offset, "%q exponent requires decimal mantissa", s.ch)
497 case e == 'p' && prefix != 'x':
498 s.errorf(s.offset, "%q exponent requires hexadecimal mantissa", s.ch)
499 }
500 s.next()
501 tok = token.FLOAT
502 if s.ch == '+' || s.ch == '-' {
503 s.next()
504 }
505 ds := s.digits(10, nil)
506 digsep |= ds
507 if ds&1 == 0 {
508 s.error(s.offset, "exponent has no digits")
509 }
510 } else if prefix == 'x' && tok == token.FLOAT {
511 s.error(s.offset, "hexadecimal mantissa requires a 'p' exponent")
512 }
513
514
515 if s.ch == 'i' {
516 tok = token.IMAG
517 s.next()
518 }
519
520 lit := string(s.src[offs:s.offset])
521 if tok == token.INT && invalid >= 0 {
522 s.errorf(invalid, "invalid digit %q in %s", lit[invalid-offs], litname(prefix))
523 }
524 if digsep&2 != 0 {
525 if i := invalidSep(lit); i >= 0 {
526 s.error(offs+i, "'_' must separate successive digits")
527 }
528 }
529
530 return tok, lit
531 }
532
// litname returns a descriptive name for the kind of number literal that is
// identified by prefix: 'x' is hexadecimal, 'o' and '0' are octal, 'b' is
// binary, and anything else is decimal.
func litname(prefix rune) string {
	name := "decimal literal"
	if prefix == 'x' {
		name = "hexadecimal literal"
	} else if prefix == 'o' || prefix == '0' {
		name = "octal literal"
	} else if prefix == 'b' {
		name = "binary literal"
	}
	return name
}
544
545
546 func invalidSep(x string) int {
547 x1 := ' '
548 d := '.'
549 i := 0
550
551
552 if len(x) >= 2 && x[0] == '0' {
553 x1 = lower(rune(x[1]))
554 if x1 == 'x' || x1 == 'o' || x1 == 'b' {
555 d = '0'
556 i = 2
557 }
558 }
559
560
561 for ; i < len(x); i++ {
562 p := d
563 d = rune(x[i])
564 switch {
565 case d == '_':
566 if p != '0' {
567 return i
568 }
569 case isDecimal(d) || x1 == 'x' && isHex(d):
570 d = '0'
571 default:
572 if p == '_' {
573 return i - 1
574 }
575 d = '.'
576 }
577 }
578 if d == '_' {
579 return len(x) - 1
580 }
581
582 return -1
583 }
584
585
586
587
588
589 func (s *Scanner) scanEscape(quote rune) bool {
590 offs := s.offset
591
592 var n int
593 var base, max uint32
594 switch s.ch {
595 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
596 s.next()
597 return true
598 case '0', '1', '2', '3', '4', '5', '6', '7':
599 n, base, max = 3, 8, 255
600 case 'x':
601 s.next()
602 n, base, max = 2, 16, 255
603 case 'u':
604 s.next()
605 n, base, max = 4, 16, unicode.MaxRune
606 case 'U':
607 s.next()
608 n, base, max = 8, 16, unicode.MaxRune
609 default:
610 msg := "unknown escape sequence"
611 if s.ch < 0 {
612 msg = "escape sequence not terminated"
613 }
614 s.error(offs, msg)
615 return false
616 }
617
618 var x uint32
619 for n > 0 {
620 d := uint32(digitVal(s.ch))
621 if d >= base {
622 msg := fmt.Sprintf("illegal character %#U in escape sequence", s.ch)
623 if s.ch < 0 {
624 msg = "escape sequence not terminated"
625 }
626 s.error(s.offset, msg)
627 return false
628 }
629 x = x*base + d
630 s.next()
631 n--
632 }
633
634 if x > max || 0xD800 <= x && x < 0xE000 {
635 s.error(offs, "escape sequence is invalid Unicode code point")
636 return false
637 }
638
639 return true
640 }
641
642 func (s *Scanner) scanRune() string {
643
644 offs := s.offset - 1
645
646 valid := true
647 n := 0
648 for {
649 ch := s.ch
650 if ch == '\n' || ch < 0 {
651
652 if valid {
653 s.error(offs, "rune literal not terminated")
654 valid = false
655 }
656 break
657 }
658 s.next()
659 if ch == '\'' {
660 break
661 }
662 n++
663 if ch == '\\' {
664 if !s.scanEscape('\'') {
665 valid = false
666 }
667
668 }
669 }
670
671 if valid && n != 1 {
672 s.error(offs, "illegal rune literal")
673 }
674
675 return string(s.src[offs:s.offset])
676 }
677
678 func (s *Scanner) scanString() string {
679
680 offs := s.offset - 1
681
682 for {
683 ch := s.ch
684 if ch == '\n' || ch < 0 {
685 s.error(offs, "string literal not terminated")
686 break
687 }
688 s.next()
689 if ch == '"' {
690 break
691 }
692 if ch == '\\' {
693 s.scanEscape('"')
694 }
695 }
696
697 return string(s.src[offs:s.offset])
698 }
699
// stripCR returns a copy of b with carriage returns removed. If comment is
// set, a '\r' is kept when it sits between a '*' and a '/' after the
// opening "/*", because removing it there would create a "*/" that
// terminates the comment early.
func stripCR(b []byte, comment bool) []byte {
	out := make([]byte, 0, len(b))
	for pos, c := range b {
		if c == '\r' {
			keep := comment &&
				len(out) > len("/*") && out[len(out)-1] == '*' &&
				pos+1 < len(b) && b[pos+1] == '/'
			if !keep {
				continue
			}
		}
		out = append(out, c)
	}
	return out
}
716
717 func (s *Scanner) scanRawString() string {
718
719 offs := s.offset - 1
720
721 hasCR := false
722 for {
723 ch := s.ch
724 if ch < 0 {
725 s.error(offs, "raw string literal not terminated")
726 break
727 }
728 s.next()
729 if ch == '`' {
730 break
731 }
732 if ch == '\r' {
733 hasCR = true
734 }
735 }
736
737 lit := s.src[offs:s.offset]
738 if hasCR {
739 lit = stripCR(lit, false)
740 }
741
742 return string(lit)
743 }
744
745 func (s *Scanner) skipWhitespace() {
746 for s.ch == ' ' || s.ch == '\t' || s.ch == '\n' && !s.insertSemi || s.ch == '\r' {
747 s.next()
748 }
749 }
750
751
752
753
754
755
756
757 func (s *Scanner) switch2(tok0, tok1 token.Token) token.Token {
758 if s.ch == '=' {
759 s.next()
760 return tok1
761 }
762 return tok0
763 }
764
765 func (s *Scanner) switch3(tok0, tok1 token.Token, ch2 rune, tok2 token.Token) token.Token {
766 if s.ch == '=' {
767 s.next()
768 return tok1
769 }
770 if s.ch == ch2 {
771 s.next()
772 return tok2
773 }
774 return tok0
775 }
776
777 func (s *Scanner) switch4(tok0, tok1 token.Token, ch2 rune, tok2, tok3 token.Token) token.Token {
778 if s.ch == '=' {
779 s.next()
780 return tok1
781 }
782 if s.ch == ch2 {
783 s.next()
784 if s.ch == '=' {
785 s.next()
786 return tok3
787 }
788 return tok2
789 }
790 return tok0
791 }
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
// Scan scans the next token and returns the token position, the token, and
// its literal string if applicable. The end of the source is indicated by
// token.EOF.
//
// For identifiers and keywords, number literals, string and rune literals,
// and comments, lit is the text of the token as returned by the respective
// scan method. For a token.SEMICOLON, lit is ";" if the semicolon was
// present in the source and "\n" if it was inserted because of a newline,
// a comment that runs to the end of the line, or EOF. For token.ILLEGAL,
// lit is the offending character. For all other tokens lit is empty.
//
// Comments are returned as token.COMMENT only if the ScanComments mode bit
// is set; otherwise they are skipped. Unless the dontInsertSemis mode bit
// is set, the scanner remembers after each token whether a following
// newline must be turned into a semicolon.
//
// Errors are reported through the scanner's error method (which invokes
// the handler passed to Init, if any, and increments ErrorCount); Scan
// still returns a token in that case.
func (s *Scanner) Scan() (pos token.Pos, tok token.Token, lit string) {
scanAgain:
	s.skipWhitespace()

	// current token start
	pos = s.file.Pos(s.offset)

	// determine token value
	insertSemi := false
	switch ch := s.ch; {
	case isLetter(ch):
		lit = s.scanIdentifier()
		if len(lit) > 1 {
			// only identifiers longer than one character are looked up as
			// possible keywords
			tok = token.Lookup(lit)
			switch tok {
			case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
				insertSemi = true
			}
		} else {
			insertSemi = true
			tok = token.IDENT
		}
	case isDecimal(ch) || ch == '.' && isDecimal(rune(s.peek())):
		insertSemi = true
		tok, lit = s.scanNumber()
	default:
		s.next() // always make progress
		switch ch {
		case -1:
			// end of source: emit a pending semicolon first, EOF next time
			if s.insertSemi {
				s.insertSemi = false // EOF consumed
				return pos, token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// We only reach here if s.insertSemi was set in the first
			// place; otherwise skipWhitespace would have skipped over
			// the newline.
			s.insertSemi = false // newline consumed
			return pos, token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			lit = s.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			lit = s.scanRune()
		case '`':
			insertSemi = true
			tok = token.STRING
			lit = s.scanRawString()
		case ':':
			tok = s.switch2(token.COLON, token.DEFINE)
		case '.':
			// a '.' followed by a decimal digit is handled by the outer
			// switch as a number
			tok = token.PERIOD
			if s.ch == '.' && s.peek() == '.' {
				s.next()
				s.next() // consume last '.'
				tok = token.ELLIPSIS
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
			lit = ";"
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = s.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = s.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = s.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if s.ch == '/' || s.ch == '*' {
				// comment
				if s.insertSemi && s.findLineEnd() {
					// The line ends within or right after the comment(s):
					// return a semicolon now and reset the position to the
					// beginning of the comment so that it is scanned on
					// the next call.
					s.ch = '/'
					s.offset = s.file.Offset(pos)
					s.rdOffset = s.offset + 1
					s.insertSemi = false // newline consumed
					return pos, token.SEMICOLON, "\n"
				}
				comment := s.scanComment()
				if s.mode&ScanComments == 0 {
					// skip comment
					s.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
				lit = comment
			} else {
				// division
				tok = s.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = s.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = s.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if s.ch == '-' {
				s.next()
				tok = token.ARROW
			} else {
				tok = s.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = s.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = s.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = s.switch2(token.NOT, token.NEQ)
		case '&':
			if s.ch == '^' {
				s.next()
				tok = s.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = s.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		case '~':
			tok = token.TILDE
		default:
			// next reports unexpected BOMs - don't repeat
			if ch != bom {
				s.errorf(s.file.Offset(pos), "illegal character %#U", ch)
			}
			insertSemi = s.insertSemi // preserve insertSemi info
			tok = token.ILLEGAL
			lit = string(ch)
		}
	}
	if s.mode&dontInsertSemis == 0 {
		s.insertSemi = insertSemi
	}

	return
}
984
View as plain text