...

Source file src/go/doc/comment/parse.go

Documentation: go/doc/comment

     1  // Copyright 2022 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package comment
     6  
     7  import (
     8  	"sort"
     9  	"strings"
    10  	"unicode"
    11  	"unicode/utf8"
    12  )
    13  
// A Doc is a parsed Go doc comment.
// It is the result type of [Parser.Parse].
type Doc struct {
	// Content is the sequence of content blocks in the comment,
	// in the order they appear in the comment text.
	Content []Block

	// Links is the list of link definitions in the comment.
	// Every definition found is recorded here, in order of appearance.
	Links []*LinkDef
}
    22  
// A LinkDef is a single link definition,
// written in a comment as a line of the form "[Text]: URL".
type LinkDef struct {
	Text string // the link text
	URL  string // the link URL
	Used bool   // whether the comment uses the definition
}
    29  
// A Block is block-level content in a doc comment,
// one of [*Code], [*Heading], [*List], or [*Paragraph].
type Block interface {
	// block is an unexported marker method: because it cannot be
	// named outside this package, only types declared here can be Blocks.
	block()
}
    35  
// A Heading is a doc comment heading.
type Heading struct {
	Text []Text // the heading text
}

// block marks *Heading as a [Block].
func (*Heading) block() {}
    42  
// A List is a numbered or bullet list.
// Lists are always non-empty: len(Items) > 0.
// In a numbered list, every Items[i].Number is a non-empty string.
// In a bullet list, every Items[i].Number is an empty string.
type List struct {
	// Items is the list items.
	Items []*ListItem

	// ForceBlankBefore indicates that the list must be
	// preceded by a blank line when reformatting the comment,
	// overriding the usual conditions. See the BlankBefore method.
	//
	// The comment parser sets ForceBlankBefore for any list
	// that is preceded by a blank line, to make sure
	// the blank line is preserved when printing.
	ForceBlankBefore bool

	// ForceBlankBetween indicates that list items must be
	// separated by blank lines when reformatting the comment,
	// overriding the usual conditions. See the BlankBetween method.
	//
	// The comment parser sets ForceBlankBetween for any list
	// that has a blank line between any two of its items, to make sure
	// the blank lines are preserved when printing.
	ForceBlankBetween bool
}

// block marks *List as a [Block].
func (*List) block() {}
    71  
    72  // BlankBefore reports whether a reformatting of the comment
    73  // should include a blank line before the list.
    74  // The default rule is the same as for [BlankBetween]:
    75  // if the list item content contains any blank lines
    76  // (meaning at least one item has multiple paragraphs)
    77  // then the list itself must be preceded by a blank line.
    78  // A preceding blank line can be forced by setting [List].ForceBlankBefore.
    79  func (l *List) BlankBefore() bool {
    80  	return l.ForceBlankBefore || l.BlankBetween()
    81  }
    82  
    83  // BlankBetween reports whether a reformatting of the comment
    84  // should include a blank line between each pair of list items.
    85  // The default rule is that if the list item content contains any blank lines
    86  // (meaning at least one item has multiple paragraphs)
    87  // then list items must themselves be separated by blank lines.
    88  // Blank line separators can be forced by setting [List].ForceBlankBetween.
    89  func (l *List) BlankBetween() bool {
    90  	if l.ForceBlankBetween {
    91  		return true
    92  	}
    93  	for _, item := range l.Items {
    94  		if len(item.Content) != 1 {
    95  			// Unreachable for parsed comments today,
    96  			// since the only way to get multiple item.Content
    97  			// is multiple paragraphs, which must have been
    98  			// separated by a blank line.
    99  			return true
   100  		}
   101  	}
   102  	return false
   103  }
   104  
// A ListItem is a single item in a numbered or bullet list.
type ListItem struct {
	// Number is a decimal string in a numbered list
	// or an empty string in a bullet list.
	// It holds the digits as written, without the trailing "." or ")".
	Number string // "1", "2", ...; "" for bullet list

	// Content is the list content.
	// Currently, restrictions in the parser and printer
	// require every element of Content to be a *Paragraph.
	Content []Block // Content of this item.
}
   116  
// A Paragraph is a paragraph of text.
type Paragraph struct {
	Text []Text // the text-level content of the paragraph
}

// block marks *Paragraph as a [Block].
func (*Paragraph) block() {}
   123  
// A Code is a preformatted code block.
type Code struct {
	// Text is the preformatted text, ending with a newline character.
	// It may be multiple lines, each of which ends with a newline character.
	// It is never empty, nor does it start or end with a blank line.
	Text string
}

// block marks *Code as a [Block].
func (*Code) block() {}
   133  
// A Text is text-level content in a doc comment,
// one of [Plain], [Italic], [*Link], or [*DocLink].
type Text interface {
	// text is an unexported marker method: because it cannot be
	// named outside this package, only types declared here can be Texts.
	text()
}
   139  
// A Plain is a string rendered as plain text (not italicized).
type Plain string

// text marks Plain as a [Text].
func (Plain) text() {}
   144  
// An Italic is a string rendered as italicized text.
type Italic string

// text marks Italic as a [Text].
func (Italic) text() {}
   149  
// A Link is a link to a specific URL.
type Link struct {
	Auto bool   // is this an automatic (implicit) link of a literal URL?
	Text []Text // text of link
	URL  string // target URL of link
}

// text marks *Link as a [Text].
func (*Link) text() {}
   158  
// A DocLink is a link to documentation for a Go package or symbol.
type DocLink struct {
	Text []Text // text of link

	// ImportPath, Recv, and Name identify the Go package or symbol
	// that is the link target. The potential combinations of
	// non-empty fields are:
	//   - ImportPath: a link to another package
	//   - ImportPath, Name: a link to a const, func, type, or var in another package
	//   - ImportPath, Recv, Name: a link to a method in another package
	//   - Name: a link to a const, func, type, or var in this package
	//   - Recv, Name: a link to a method in this package
	ImportPath string // import path
	Recv       string // receiver type, without any pointer star, for methods
	Name       string // const, func, type, var, or method name
}

// text marks *DocLink as a [Text].
func (*DocLink) text() {}
   177  
// A Parser is a doc comment parser.
// The fields in the struct can be filled in before calling Parse
// in order to customize the details of the parsing process.
// The LookupPackage and LookupSym fields may be left nil;
// the effect of doing so is described on each field.
type Parser struct {
	// Words is a map of Go identifier words that
	// should be italicized and potentially linked.
	// If Words[w] is the empty string, then the word w
	// is only italicized. Otherwise it is linked, using
	// Words[w] as the link target.
	// Words corresponds to the [go/doc.ToHTML] words parameter.
	Words map[string]string

	// LookupPackage resolves a package name to an import path.
	//
	// If LookupPackage(name) returns ok == true, then [name]
	// (or [name.Sym] or [name.Sym.Method])
	// is considered a documentation link to importPath's package docs.
	// It is valid to return "", true, in which case name is considered
	// to refer to the current package.
	//
	// If LookupPackage(name) returns ok == false,
	// then [name] (or [name.Sym] or [name.Sym.Method])
	// will not be considered a documentation link,
	// except in the case where name is the full (but single-element) import path
	// of a package in the standard library, such as in [math] or [io.Reader].
	// LookupPackage is still called for such names,
	// in order to permit references to imports of other packages
	// with the same package names.
	//
	// Setting LookupPackage to nil is equivalent to setting it to
	// a function that always returns "", false.
	LookupPackage func(name string) (importPath string, ok bool)

	// LookupSym reports whether a symbol name or method name
	// exists in the current package.
	//
	// If LookupSym("", "Name") returns true, then [Name]
	// is considered a documentation link for a const, func, type, or var.
	//
	// Similarly, if LookupSym("Recv", "Name") returns true,
	// then [Recv.Name] is considered a documentation link for
	// type Recv's method Name.
	//
	// Setting LookupSym to nil is equivalent to setting it to a function
	// that always returns false.
	LookupSym func(recv, name string) (ok bool)
}
   225  
// parseDoc is parsing state for a single doc comment.
// It embeds the configuring *Parser and the *Doc being built,
// so the fields of both are accessible directly on the parseDoc.
type parseDoc struct {
	*Parser
	*Doc
	links     map[string]*LinkDef          // link definitions by link text; the first definition of a text wins
	lines     []string                     // the comment lines, after unindenting
	lookupSym func(recv, name string) bool // Parser.LookupSym, or a func that always returns false when that is nil
}
   234  
   235  // lookupPkg is called to look up the pkg in [pkg], [pkg.Name], and [pkg.Name.Recv].
   236  // If pkg has a slash, it is assumed to be the full import path and is returned with ok = true.
   237  //
   238  // Otherwise, pkg is probably a simple package name like "rand" (not "crypto/rand" or "math/rand").
   239  // d.LookupPackage provides a way for the caller to allow resolving such names with reference
   240  // to the imports in the surrounding package.
   241  //
   242  // There is one collision between these two cases: single-element standard library names
   243  // like "math" are full import paths but don't contain slashes. We let d.LookupPackage have
   244  // the first chance to resolve it, in case there's a different package imported as math,
   245  // and otherwise we refer to a built-in list of single-element standard library package names.
   246  func (d *parseDoc) lookupPkg(pkg string) (importPath string, ok bool) {
   247  	if strings.Contains(pkg, "/") { // assume a full import path
   248  		if validImportPath(pkg) {
   249  			return pkg, true
   250  		}
   251  		return "", false
   252  	}
   253  	if d.LookupPackage != nil {
   254  		// Give LookupPackage a chance.
   255  		if path, ok := d.LookupPackage(pkg); ok {
   256  			return path, true
   257  		}
   258  	}
   259  	return DefaultLookupPackage(pkg)
   260  }
   261  
   262  func isStdPkg(path string) bool {
   263  	// TODO(rsc): Use sort.Find once we don't have to worry about
   264  	// copying this code into older Go environments.
   265  	i := sort.Search(len(stdPkgs), func(i int) bool { return stdPkgs[i] >= path })
   266  	return i < len(stdPkgs) && stdPkgs[i] == path
   267  }
   268  
   269  // DefaultLookupPackage is the default package lookup
   270  // function, used when [Parser].LookupPackage is nil.
   271  // It recognizes names of the packages from the standard
   272  // library with single-element import paths, such as math,
   273  // which would otherwise be impossible to name.
   274  //
   275  // Note that the go/doc package provides a more sophisticated
   276  // lookup based on the imports used in the current package.
   277  func DefaultLookupPackage(name string) (importPath string, ok bool) {
   278  	if isStdPkg(name) {
   279  		return name, true
   280  	}
   281  	return "", false
   282  }
   283  
   284  // Parse parses the doc comment text and returns the *Doc form.
   285  // Comment markers (/* // and */) in the text must have already been removed.
   286  func (p *Parser) Parse(text string) *Doc {
   287  	lines := unindent(strings.Split(text, "\n"))
   288  	d := &parseDoc{
   289  		Parser:    p,
   290  		Doc:       new(Doc),
   291  		links:     make(map[string]*LinkDef),
   292  		lines:     lines,
   293  		lookupSym: func(recv, name string) bool { return false },
   294  	}
   295  	if p.LookupSym != nil {
   296  		d.lookupSym = p.LookupSym
   297  	}
   298  
   299  	// First pass: break into block structure and collect known links.
   300  	// The text is all recorded as Plain for now.
   301  	var prev span
   302  	for _, s := range parseSpans(lines) {
   303  		var b Block
   304  		switch s.kind {
   305  		default:
   306  			panic("go/doc/comment: internal error: unknown span kind")
   307  		case spanList:
   308  			b = d.list(lines[s.start:s.end], prev.end < s.start)
   309  		case spanCode:
   310  			b = d.code(lines[s.start:s.end])
   311  		case spanOldHeading:
   312  			b = d.oldHeading(lines[s.start])
   313  		case spanHeading:
   314  			b = d.heading(lines[s.start])
   315  		case spanPara:
   316  			b = d.paragraph(lines[s.start:s.end])
   317  		}
   318  		if b != nil {
   319  			d.Content = append(d.Content, b)
   320  		}
   321  		prev = s
   322  	}
   323  
   324  	// Second pass: interpret all the Plain text now that we know the links.
   325  	for _, b := range d.Content {
   326  		switch b := b.(type) {
   327  		case *Paragraph:
   328  			b.Text = d.parseLinkedText(string(b.Text[0].(Plain)))
   329  		case *List:
   330  			for _, i := range b.Items {
   331  				for _, c := range i.Content {
   332  					p := c.(*Paragraph)
   333  					p.Text = d.parseLinkedText(string(p.Text[0].(Plain)))
   334  				}
   335  			}
   336  		}
   337  	}
   338  
   339  	return d.Doc
   340  }
   341  
// A span represents a single span of comment lines (lines[start:end])
// of an identified kind (code, heading, paragraph, and so on).
type span struct {
	start int      // index of the first line of the span
	end   int      // index just past the last line of the span
	kind  spanKind // what the lines were identified as
}
   349  
// A spanKind describes the kind of span.
type spanKind int

// The span kinds. The zero value is deliberately left unused,
// so an unset kind does not match any of the named kinds.
const (
	_ spanKind = iota
	spanCode
	spanHeading
	spanList
	spanOldHeading
	spanPara
)
   361  
// parseSpans partitions lines into a sequence of spans in increasing line order,
// classifying each as code, list, heading, old-style heading, or paragraph.
// Blank lines between spans are skipped and belong to no span.
//
// A run of indented lines becomes a code block, or a list if its first line
// starts with a list marker. A run of unindented lines becomes a paragraph,
// or a heading if it is a single line of the right form.
// Two heuristics repair common mistakes in comments that are not gofmt'ed:
// unindented list items or code-opening lines directly followed by indented
// lines are treated as indented (tracked by forceIndent), and an unindented
// line starting with "}" directly after an indented run is absorbed into it.
func parseSpans(lines []string) []span {
	var spans []span

	// The loop may process a line twice: once as unindented
	// and again forced indented. So the maximum expected
	// number of iterations is 2*len(lines). The repeating logic
	// can be subtle, though, and to protect against introduction
	// of infinite loops in future changes, we watch to see that
	// we are not looping too much. A panic is better than a
	// quiet infinite loop.
	watchdog := 2 * len(lines)

	i := 0
	// Lines with index below forceIndent are treated as indented
	// even if they are not.
	forceIndent := 0
Spans:
	for {
		// Skip blank lines.
		for i < len(lines) && lines[i] == "" {
			i++
		}
		if i >= len(lines) {
			break
		}
		if watchdog--; watchdog < 0 {
			panic("go/doc/comment: internal error: not making progress")
		}

		var kind spanKind
		start := i
		end := i
		if i < forceIndent || indented(lines[i]) {
			// Indented (or force indented).
			// Ends before next unindented. (Blank lines are OK.)
			// If this is an unindented list that we are heuristically treating as indented,
			// then accept unindented list item lines up to the first blank lines.
			// The heuristic is disabled at blank lines to contain its effect
			// to non-gofmt'ed sections of the comment.
			unindentedListOK := isList(lines[i]) && i < forceIndent
			i++
			for i < len(lines) && (lines[i] == "" || i < forceIndent || indented(lines[i]) || (unindentedListOK && isList(lines[i]))) {
				if lines[i] == "" {
					unindentedListOK = false
				}
				i++
			}

			// Drop trailing blank lines.
			end = i
			for end > start && lines[end-1] == "" {
				end--
			}

			// If indented lines are followed (without a blank line)
			// by an unindented line beginning with a closing brace,
			// take that one line too. This fixes the common mistake
			// of pasting in something like
			//
			// func main() {
			//	fmt.Println("hello, world")
			// }
			//
			// and forgetting to indent it.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block or list would be
			// followed by a blank line or end of comment.
			if end < len(lines) && strings.HasPrefix(lines[end], "}") {
				end++
			}

			if isList(lines[start]) {
				kind = spanList
			} else {
				kind = spanCode
			}
		} else {
			// Unindented. Ends at next blank or indented line.
			i++
			for i < len(lines) && lines[i] != "" && !indented(lines[i]) {
				i++
			}
			end = i

			// If unindented lines are followed (without a blank line)
			// by an indented line that would start a code block,
			// check whether the final unindented lines
			// should be left for the indented section.
			// This can happen for the common mistakes of
			// unindented code or unindented lists.
			// The heuristic will never trigger on a gofmt'ed comment,
			// because any gofmt'ed code block would have a blank line
			// preceding it after the unindented lines.
			if i < len(lines) && lines[i] != "" && !isList(lines[i]) {
				switch {
				case isList(lines[i-1]):
					// If the final unindented line looks like a list item,
					// this may be the first indented line wrap of
					// a mistakenly unindented list.
					// Leave all the unindented list items.
					forceIndent = end
					end--
					for end > start && isList(lines[end-1]) {
						end--
					}

				case strings.HasSuffix(lines[i-1], "{") || strings.HasSuffix(lines[i-1], `\`):
					// If the final unindented line ended in { or \
					// it is probably the start of a misindented code block.
					// Give the user a single line fix.
					// Often that's enough; if not, the user can fix the others themselves.
					forceIndent = end
					end--
				}

				// Nothing is left for the unindented span:
				// reprocess from start, now as force-indented lines.
				if start == end && forceIndent > start {
					i = start
					continue Spans
				}
			}

			// Span is either paragraph or heading.
			if end-start == 1 && isHeading(lines[start]) {
				kind = spanHeading
			} else if end-start == 1 && isOldHeading(lines[start], lines, start) {
				kind = spanOldHeading
			} else {
				kind = spanPara
			}
		}

		spans = append(spans, span{start, end, kind})
		// Resume at end, not i: lines handed back to a following
		// indented section must be scanned again.
		i = end
	}

	return spans
}
   497  
   498  // indented reports whether line is indented
   499  // (starts with a leading space or tab).
   500  func indented(line string) bool {
   501  	return line != "" && (line[0] == ' ' || line[0] == '\t')
   502  }
   503  
   504  // unindent removes any common space/tab prefix
   505  // from each line in lines, returning a copy of lines in which
   506  // those prefixes have been trimmed from each line.
   507  // It also replaces any lines containing only spaces with blank lines (empty strings).
   508  func unindent(lines []string) []string {
   509  	// Trim leading and trailing blank lines.
   510  	for len(lines) > 0 && isBlank(lines[0]) {
   511  		lines = lines[1:]
   512  	}
   513  	for len(lines) > 0 && isBlank(lines[len(lines)-1]) {
   514  		lines = lines[:len(lines)-1]
   515  	}
   516  	if len(lines) == 0 {
   517  		return nil
   518  	}
   519  
   520  	// Compute and remove common indentation.
   521  	prefix := leadingSpace(lines[0])
   522  	for _, line := range lines[1:] {
   523  		if !isBlank(line) {
   524  			prefix = commonPrefix(prefix, leadingSpace(line))
   525  		}
   526  	}
   527  
   528  	out := make([]string, len(lines))
   529  	for i, line := range lines {
   530  		line = strings.TrimPrefix(line, prefix)
   531  		if strings.TrimSpace(line) == "" {
   532  			line = ""
   533  		}
   534  		out[i] = line
   535  	}
   536  	for len(out) > 0 && out[0] == "" {
   537  		out = out[1:]
   538  	}
   539  	for len(out) > 0 && out[len(out)-1] == "" {
   540  		out = out[:len(out)-1]
   541  	}
   542  	return out
   543  }
   544  
   545  // isBlank reports whether s is a blank line.
   546  func isBlank(s string) bool {
   547  	return len(s) == 0 || (len(s) == 1 && s[0] == '\n')
   548  }
   549  
   550  // commonPrefix returns the longest common prefix of a and b.
   551  func commonPrefix(a, b string) string {
   552  	i := 0
   553  	for i < len(a) && i < len(b) && a[i] == b[i] {
   554  		i++
   555  	}
   556  	return a[0:i]
   557  }
   558  
   559  // leadingSpace returns the longest prefix of s consisting of spaces and tabs.
   560  func leadingSpace(s string) string {
   561  	i := 0
   562  	for i < len(s) && (s[i] == ' ' || s[i] == '\t') {
   563  		i++
   564  	}
   565  	return s[:i]
   566  }
   567  
   568  // isOldHeading reports whether line is an old-style section heading.
   569  // line is all[off].
   570  func isOldHeading(line string, all []string, off int) bool {
   571  	if off <= 0 || all[off-1] != "" || off+2 >= len(all) || all[off+1] != "" || leadingSpace(all[off+2]) != "" {
   572  		return false
   573  	}
   574  
   575  	line = strings.TrimSpace(line)
   576  
   577  	// a heading must start with an uppercase letter
   578  	r, _ := utf8.DecodeRuneInString(line)
   579  	if !unicode.IsLetter(r) || !unicode.IsUpper(r) {
   580  		return false
   581  	}
   582  
   583  	// it must end in a letter or digit:
   584  	r, _ = utf8.DecodeLastRuneInString(line)
   585  	if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
   586  		return false
   587  	}
   588  
   589  	// exclude lines with illegal characters. we allow "(),"
   590  	if strings.ContainsAny(line, ";:!?+*/=[]{}_^°&§~%#@<\">\\") {
   591  		return false
   592  	}
   593  
   594  	// allow "'" for possessive "'s" only
   595  	for b := line; ; {
   596  		var ok bool
   597  		if _, b, ok = strings.Cut(b, "'"); !ok {
   598  			break
   599  		}
   600  		if b != "s" && !strings.HasPrefix(b, "s ") {
   601  			return false // ' not followed by s and then end-of-word
   602  		}
   603  	}
   604  
   605  	// allow "." when followed by non-space
   606  	for b := line; ; {
   607  		var ok bool
   608  		if _, b, ok = strings.Cut(b, "."); !ok {
   609  			break
   610  		}
   611  		if b == "" || strings.HasPrefix(b, " ") {
   612  			return false // not followed by non-space
   613  		}
   614  	}
   615  
   616  	return true
   617  }
   618  
   619  // oldHeading returns the *Heading for the given old-style section heading line.
   620  func (d *parseDoc) oldHeading(line string) Block {
   621  	return &Heading{Text: []Text{Plain(strings.TrimSpace(line))}}
   622  }
   623  
   624  // isHeading reports whether line is a new-style section heading.
   625  func isHeading(line string) bool {
   626  	return len(line) >= 2 &&
   627  		line[0] == '#' &&
   628  		(line[1] == ' ' || line[1] == '\t') &&
   629  		strings.TrimSpace(line) != "#"
   630  }
   631  
   632  // heading returns the *Heading for the given new-style section heading line.
   633  func (d *parseDoc) heading(line string) Block {
   634  	return &Heading{Text: []Text{Plain(strings.TrimSpace(line[1:]))}}
   635  }
   636  
   637  // code returns a code block built from the lines.
   638  func (d *parseDoc) code(lines []string) *Code {
   639  	body := unindent(lines)
   640  	body = append(body, "") // to get final \n from Join
   641  	return &Code{Text: strings.Join(body, "\n")}
   642  }
   643  
   644  // paragraph returns a paragraph block built from the lines.
   645  // If the lines are link definitions, paragraph adds them to d and returns nil.
   646  func (d *parseDoc) paragraph(lines []string) Block {
   647  	// Is this a block of known links? Handle.
   648  	var defs []*LinkDef
   649  	for _, line := range lines {
   650  		def, ok := parseLink(line)
   651  		if !ok {
   652  			goto NoDefs
   653  		}
   654  		defs = append(defs, def)
   655  	}
   656  	for _, def := range defs {
   657  		d.Links = append(d.Links, def)
   658  		if d.links[def.Text] == nil {
   659  			d.links[def.Text] = def
   660  		}
   661  	}
   662  	return nil
   663  NoDefs:
   664  
   665  	return &Paragraph{Text: []Text{Plain(strings.Join(lines, "\n"))}}
   666  }
   667  
   668  // parseLink parses a single link definition line:
   669  //
   670  //	[text]: url
   671  //
   672  // It returns the link definition and whether the line was well formed.
   673  func parseLink(line string) (*LinkDef, bool) {
   674  	if line == "" || line[0] != '[' {
   675  		return nil, false
   676  	}
   677  	i := strings.Index(line, "]:")
   678  	if i < 0 || i+3 >= len(line) || (line[i+2] != ' ' && line[i+2] != '\t') {
   679  		return nil, false
   680  	}
   681  
   682  	text := line[1:i]
   683  	url := strings.TrimSpace(line[i+3:])
   684  	j := strings.Index(url, "://")
   685  	if j < 0 || !isScheme(url[:j]) {
   686  		return nil, false
   687  	}
   688  
   689  	// Line has right form and has valid scheme://.
   690  	// That's good enough for us - we are not as picky
   691  	// about the characters beyond the :// as we are
   692  	// when extracting inline URLs from text.
   693  	return &LinkDef{Text: text, URL: url}, true
   694  }
   695  
   696  // list returns a list built from the indented lines,
   697  // using forceBlankBefore as the value of the List's ForceBlankBefore field.
   698  func (d *parseDoc) list(lines []string, forceBlankBefore bool) *List {
   699  	num, _, _ := listMarker(lines[0])
   700  	var (
   701  		list *List = &List{ForceBlankBefore: forceBlankBefore}
   702  		item *ListItem
   703  		text []string
   704  	)
   705  	flush := func() {
   706  		if item != nil {
   707  			if para := d.paragraph(text); para != nil {
   708  				item.Content = append(item.Content, para)
   709  			}
   710  		}
   711  		text = nil
   712  	}
   713  
   714  	for _, line := range lines {
   715  		if n, after, ok := listMarker(line); ok && (n != "") == (num != "") {
   716  			// start new list item
   717  			flush()
   718  
   719  			item = &ListItem{Number: n}
   720  			list.Items = append(list.Items, item)
   721  			line = after
   722  		}
   723  		line = strings.TrimSpace(line)
   724  		if line == "" {
   725  			list.ForceBlankBetween = true
   726  			flush()
   727  			continue
   728  		}
   729  		text = append(text, strings.TrimSpace(line))
   730  	}
   731  	flush()
   732  	return list
   733  }
   734  
   735  // listMarker parses the line as beginning with a list marker.
   736  // If it can do that, it returns the numeric marker ("" for a bullet list),
   737  // the rest of the line, and ok == true.
   738  // Otherwise, it returns "", "", false.
   739  func listMarker(line string) (num, rest string, ok bool) {
   740  	line = strings.TrimSpace(line)
   741  	if line == "" {
   742  		return "", "", false
   743  	}
   744  
   745  	// Can we find a marker?
   746  	if r, n := utf8.DecodeRuneInString(line); r == '•' || r == '*' || r == '+' || r == '-' {
   747  		num, rest = "", line[n:]
   748  	} else if '0' <= line[0] && line[0] <= '9' {
   749  		n := 1
   750  		for n < len(line) && '0' <= line[n] && line[n] <= '9' {
   751  			n++
   752  		}
   753  		if n >= len(line) || (line[n] != '.' && line[n] != ')') {
   754  			return "", "", false
   755  		}
   756  		num, rest = line[:n], line[n+1:]
   757  	} else {
   758  		return "", "", false
   759  	}
   760  
   761  	if !indented(rest) || strings.TrimSpace(rest) == "" {
   762  		return "", "", false
   763  	}
   764  
   765  	return num, rest, true
   766  }
   767  
// isList reports whether the line is the first line of a list,
// meaning starts with a list marker after any indentation.
// (The caller is responsible for checking the line is indented, as appropriate.)
// It is the ok result of listMarker, with the parsed marker
// and remaining text discarded.
func isList(line string) bool {
	_, _, ok := listMarker(line)
	return ok
}
   775  
   776  // parseLinkedText parses text that is allowed to contain explicit links,
   777  // such as [math.Sin] or [Go home page], into a slice of Text items.
   778  //
   779  // A “pkg” is only assumed to be a full import path if it starts with
   780  // a domain name (a path element with a dot) or is one of the packages
   781  // from the standard library (“[os]”, “[encoding/json]”, and so on).
   782  // To avoid problems with maps, generics, and array types, doc links
   783  // must be both preceded and followed by punctuation, spaces, tabs,
   784  // or the start or end of a line. An example problem would be treating
   785  // map[ast.Expr]TypeAndValue as containing a link.
   786  func (d *parseDoc) parseLinkedText(text string) []Text {
   787  	var out []Text
   788  	wrote := 0
   789  	flush := func(i int) {
   790  		if wrote < i {
   791  			out = d.parseText(out, text[wrote:i], true)
   792  			wrote = i
   793  		}
   794  	}
   795  
   796  	start := -1
   797  	var buf []byte
   798  	for i := 0; i < len(text); i++ {
   799  		c := text[i]
   800  		if c == '\n' || c == '\t' {
   801  			c = ' '
   802  		}
   803  		switch c {
   804  		case '[':
   805  			start = i
   806  		case ']':
   807  			if start >= 0 {
   808  				if def, ok := d.links[string(buf)]; ok {
   809  					def.Used = true
   810  					flush(start)
   811  					out = append(out, &Link{
   812  						Text: d.parseText(nil, text[start+1:i], false),
   813  						URL:  def.URL,
   814  					})
   815  					wrote = i + 1
   816  				} else if link, ok := d.docLink(text[start+1:i], text[:start], text[i+1:]); ok {
   817  					flush(start)
   818  					link.Text = d.parseText(nil, text[start+1:i], false)
   819  					out = append(out, link)
   820  					wrote = i + 1
   821  				}
   822  			}
   823  			start = -1
   824  			buf = buf[:0]
   825  		}
   826  		if start >= 0 && i != start {
   827  			buf = append(buf, c)
   828  		}
   829  	}
   830  
   831  	flush(len(text))
   832  	return out
   833  }
   834  
   835  // docLink parses text, which was found inside [ ] brackets,
   836  // as a doc link if possible, returning the DocLink and ok == true
   837  // or else nil, false.
   838  // The before and after strings are the text before the [ and after the ]
   839  // on the same line. Doc links must be preceded and followed by
   840  // punctuation, spaces, tabs, or the start or end of a line.
   841  func (d *parseDoc) docLink(text, before, after string) (link *DocLink, ok bool) {
   842  	if before != "" {
   843  		r, _ := utf8.DecodeLastRuneInString(before)
   844  		if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' {
   845  			return nil, false
   846  		}
   847  	}
   848  	if after != "" {
   849  		r, _ := utf8.DecodeRuneInString(after)
   850  		if !unicode.IsPunct(r) && r != ' ' && r != '\t' && r != '\n' {
   851  			return nil, false
   852  		}
   853  	}
   854  	if strings.HasPrefix(text, "*") {
   855  		text = text[1:]
   856  	}
   857  	pkg, name, ok := splitDocName(text)
   858  	var recv string
   859  	if ok {
   860  		pkg, recv, _ = splitDocName(pkg)
   861  	}
   862  	if pkg != "" {
   863  		if pkg, ok = d.lookupPkg(pkg); !ok {
   864  			return nil, false
   865  		}
   866  	} else {
   867  		if ok = d.lookupSym(recv, name); !ok {
   868  			return nil, false
   869  		}
   870  	}
   871  	link = &DocLink{
   872  		ImportPath: pkg,
   873  		Recv:       recv,
   874  		Name:       name,
   875  	}
   876  	return link, true
   877  }
   878  
   879  // If text is of the form before.Name, where Name is a capitalized Go identifier,
   880  // then splitDocName returns before, name, true.
   881  // Otherwise it returns text, "", false.
   882  func splitDocName(text string) (before, name string, foundDot bool) {
   883  	i := strings.LastIndex(text, ".")
   884  	name = text[i+1:]
   885  	if !isName(name) {
   886  		return text, "", false
   887  	}
   888  	if i >= 0 {
   889  		before = text[:i]
   890  	}
   891  	return before, name, true
   892  }
   893  
   894  // parseText parses s as text and returns the result of appending
   895  // those parsed Text elements to out.
   896  // parseText does not handle explicit links like [math.Sin] or [Go home page]:
   897  // those are handled by parseLinkedText.
   898  // If autoLink is true, then parseText recognizes URLs and words from d.Words
   899  // and converts those to links as appropriate.
   900  func (d *parseDoc) parseText(out []Text, s string, autoLink bool) []Text {
   901  	var w strings.Builder
   902  	wrote := 0
   903  	writeUntil := func(i int) {
   904  		w.WriteString(s[wrote:i])
   905  		wrote = i
   906  	}
   907  	flush := func(i int) {
   908  		writeUntil(i)
   909  		if w.Len() > 0 {
   910  			out = append(out, Plain(w.String()))
   911  			w.Reset()
   912  		}
   913  	}
   914  	for i := 0; i < len(s); {
   915  		t := s[i:]
   916  		if autoLink {
   917  			if url, ok := autoURL(t); ok {
   918  				flush(i)
   919  				// Note: The old comment parser would look up the URL in words
   920  				// and replace the target with words[URL] if it was non-empty.
   921  				// That would allow creating links that display as one URL but
   922  				// when clicked go to a different URL. Not sure what the point
   923  				// of that is, so we're not doing that lookup here.
   924  				out = append(out, &Link{Auto: true, Text: []Text{Plain(url)}, URL: url})
   925  				i += len(url)
   926  				wrote = i
   927  				continue
   928  			}
   929  			if id, ok := ident(t); ok {
   930  				url, italics := d.Words[id]
   931  				if !italics {
   932  					i += len(id)
   933  					continue
   934  				}
   935  				flush(i)
   936  				if url == "" {
   937  					out = append(out, Italic(id))
   938  				} else {
   939  					out = append(out, &Link{Auto: true, Text: []Text{Italic(id)}, URL: url})
   940  				}
   941  				i += len(id)
   942  				wrote = i
   943  				continue
   944  			}
   945  		}
   946  		switch {
   947  		case strings.HasPrefix(t, "``"):
   948  			if len(t) >= 3 && t[2] == '`' {
   949  				// Do not convert `` inside ```, in case people are mistakenly writing Markdown.
   950  				i += 3
   951  				for i < len(t) && t[i] == '`' {
   952  					i++
   953  				}
   954  				break
   955  			}
   956  			writeUntil(i)
   957  			w.WriteRune('“')
   958  			i += 2
   959  			wrote = i
   960  		case strings.HasPrefix(t, "''"):
   961  			writeUntil(i)
   962  			w.WriteRune('”')
   963  			i += 2
   964  			wrote = i
   965  		default:
   966  			i++
   967  		}
   968  	}
   969  	flush(len(s))
   970  	return out
   971  }
   972  
   973  // autoURL checks whether s begins with a URL that should be hyperlinked.
   974  // If so, it returns the URL, which is a prefix of s, and ok == true.
   975  // Otherwise it returns "", false.
   976  // The caller should skip over the first len(url) bytes of s
   977  // before further processing.
   978  func autoURL(s string) (url string, ok bool) {
   979  	// Find the ://. Fast path to pick off non-URL,
   980  	// since we call this at every position in the string.
   981  	// The shortest possible URL is ftp://x, 7 bytes.
   982  	var i int
   983  	switch {
   984  	case len(s) < 7:
   985  		return "", false
   986  	case s[3] == ':':
   987  		i = 3
   988  	case s[4] == ':':
   989  		i = 4
   990  	case s[5] == ':':
   991  		i = 5
   992  	case s[6] == ':':
   993  		i = 6
   994  	default:
   995  		return "", false
   996  	}
   997  	if i+3 > len(s) || s[i:i+3] != "://" {
   998  		return "", false
   999  	}
  1000  
  1001  	// Check valid scheme.
  1002  	if !isScheme(s[:i]) {
  1003  		return "", false
  1004  	}
  1005  
  1006  	// Scan host part. Must have at least one byte,
  1007  	// and must start and end in non-punctuation.
  1008  	i += 3
  1009  	if i >= len(s) || !isHost(s[i]) || isPunct(s[i]) {
  1010  		return "", false
  1011  	}
  1012  	i++
  1013  	end := i
  1014  	for i < len(s) && isHost(s[i]) {
  1015  		if !isPunct(s[i]) {
  1016  			end = i + 1
  1017  		}
  1018  		i++
  1019  	}
  1020  	i = end
  1021  
  1022  	// At this point we are definitely returning a URL (scheme://host).
  1023  	// We just have to find the longest path we can add to it.
  1024  	// Heuristics abound.
  1025  	// We allow parens, braces, and brackets,
  1026  	// but only if they match (#5043, #22285).
  1027  	// We allow .,:;?! in the path but not at the end,
  1028  	// to avoid end-of-sentence punctuation (#18139, #16565).
  1029  	stk := []byte{}
  1030  	end = i
  1031  Path:
  1032  	for ; i < len(s); i++ {
  1033  		if isPunct(s[i]) {
  1034  			continue
  1035  		}
  1036  		if !isPath(s[i]) {
  1037  			break
  1038  		}
  1039  		switch s[i] {
  1040  		case '(':
  1041  			stk = append(stk, ')')
  1042  		case '{':
  1043  			stk = append(stk, '}')
  1044  		case '[':
  1045  			stk = append(stk, ']')
  1046  		case ')', '}', ']':
  1047  			if len(stk) == 0 || stk[len(stk)-1] != s[i] {
  1048  				break Path
  1049  			}
  1050  			stk = stk[:len(stk)-1]
  1051  		}
  1052  		if len(stk) == 0 {
  1053  			end = i + 1
  1054  		}
  1055  	}
  1056  
  1057  	return s[:end], true
  1058  }
  1059  
  1060  // isScheme reports whether s is a recognized URL scheme.
  1061  // Note that if strings of new length (beyond 3-7)
  1062  // are added here, the fast path at the top of autoURL will need updating.
  1063  func isScheme(s string) bool {
  1064  	switch s {
  1065  	case "file",
  1066  		"ftp",
  1067  		"gopher",
  1068  		"http",
  1069  		"https",
  1070  		"mailto",
  1071  		"nntp":
  1072  		return true
  1073  	}
  1074  	return false
  1075  }
  1076  
  1077  // isHost reports whether c is a byte that can appear in a URL host,
  1078  // like www.example.com or user@[::1]:8080
  1079  func isHost(c byte) bool {
  1080  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1081  	// so that the byte c can be tested with a shift and an and.
  1082  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1083  	// and this function will return false.
  1084  	const mask = 0 |
  1085  		(1<<26-1)<<'A' |
  1086  		(1<<26-1)<<'a' |
  1087  		(1<<10-1)<<'0' |
  1088  		1<<'_' |
  1089  		1<<'@' |
  1090  		1<<'-' |
  1091  		1<<'.' |
  1092  		1<<'[' |
  1093  		1<<']' |
  1094  		1<<':'
  1095  
  1096  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1097  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1098  }
  1099  
  1100  // isPunct reports whether c is a punctuation byte that can appear
  1101  // inside a path but not at the end.
  1102  func isPunct(c byte) bool {
  1103  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1104  	// so that the byte c can be tested with a shift and an and.
  1105  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1106  	// and this function will return false.
  1107  	const mask = 0 |
  1108  		1<<'.' |
  1109  		1<<',' |
  1110  		1<<':' |
  1111  		1<<';' |
  1112  		1<<'?' |
  1113  		1<<'!'
  1114  
  1115  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1116  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1117  }
  1118  
  1119  // isPath reports whether c is a (non-punctuation) path byte.
  1120  func isPath(c byte) bool {
  1121  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1122  	// so that the byte c can be tested with a shift and an and.
  1123  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1124  	// and this function will return false.
  1125  	const mask = 0 |
  1126  		(1<<26-1)<<'A' |
  1127  		(1<<26-1)<<'a' |
  1128  		(1<<10-1)<<'0' |
  1129  		1<<'$' |
  1130  		1<<'\'' |
  1131  		1<<'(' |
  1132  		1<<')' |
  1133  		1<<'*' |
  1134  		1<<'+' |
  1135  		1<<'&' |
  1136  		1<<'#' |
  1137  		1<<'=' |
  1138  		1<<'@' |
  1139  		1<<'~' |
  1140  		1<<'_' |
  1141  		1<<'/' |
  1142  		1<<'-' |
  1143  		1<<'[' |
  1144  		1<<']' |
  1145  		1<<'{' |
  1146  		1<<'}' |
  1147  		1<<'%'
  1148  
  1149  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1150  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1151  }
  1152  
  1153  // isName reports whether s is a capitalized Go identifier (like Name).
  1154  func isName(s string) bool {
  1155  	t, ok := ident(s)
  1156  	if !ok || t != s {
  1157  		return false
  1158  	}
  1159  	r, _ := utf8.DecodeRuneInString(s)
  1160  	return unicode.IsUpper(r)
  1161  }
  1162  
  1163  // ident checks whether s begins with a Go identifier.
  1164  // If so, it returns the identifier, which is a prefix of s, and ok == true.
  1165  // Otherwise it returns "", false.
  1166  // The caller should skip over the first len(id) bytes of s
  1167  // before further processing.
  1168  func ident(s string) (id string, ok bool) {
  1169  	// Scan [\pL_][\pL_0-9]*
  1170  	n := 0
  1171  	for n < len(s) {
  1172  		if c := s[n]; c < utf8.RuneSelf {
  1173  			if isIdentASCII(c) && (n > 0 || c < '0' || c > '9') {
  1174  				n++
  1175  				continue
  1176  			}
  1177  			break
  1178  		}
  1179  		r, nr := utf8.DecodeRuneInString(s[n:])
  1180  		if unicode.IsLetter(r) {
  1181  			n += nr
  1182  			continue
  1183  		}
  1184  		break
  1185  	}
  1186  	return s[:n], n > 0
  1187  }
  1188  
  1189  // isIdentASCII reports whether c is an ASCII identifier byte.
  1190  func isIdentASCII(c byte) bool {
  1191  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1192  	// so that the byte c can be tested with a shift and an and.
  1193  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1194  	// and this function will return false.
  1195  	const mask = 0 |
  1196  		(1<<26-1)<<'A' |
  1197  		(1<<26-1)<<'a' |
  1198  		(1<<10-1)<<'0' |
  1199  		1<<'_'
  1200  
  1201  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1202  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1203  }
  1204  
  1205  // validImportPath reports whether path is a valid import path.
  1206  // It is a lightly edited copy of golang.org/x/mod/module.CheckImportPath.
  1207  func validImportPath(path string) bool {
  1208  	if !utf8.ValidString(path) {
  1209  		return false
  1210  	}
  1211  	if path == "" {
  1212  		return false
  1213  	}
  1214  	if path[0] == '-' {
  1215  		return false
  1216  	}
  1217  	if strings.Contains(path, "//") {
  1218  		return false
  1219  	}
  1220  	if path[len(path)-1] == '/' {
  1221  		return false
  1222  	}
  1223  	elemStart := 0
  1224  	for i, r := range path {
  1225  		if r == '/' {
  1226  			if !validImportPathElem(path[elemStart:i]) {
  1227  				return false
  1228  			}
  1229  			elemStart = i + 1
  1230  		}
  1231  	}
  1232  	return validImportPathElem(path[elemStart:])
  1233  }
  1234  
  1235  func validImportPathElem(elem string) bool {
  1236  	if elem == "" || elem[0] == '.' || elem[len(elem)-1] == '.' {
  1237  		return false
  1238  	}
  1239  	for i := 0; i < len(elem); i++ {
  1240  		if !importPathOK(elem[i]) {
  1241  			return false
  1242  		}
  1243  	}
  1244  	return true
  1245  }
  1246  
  1247  func importPathOK(c byte) bool {
  1248  	// mask is a 128-bit bitmap with 1s for allowed bytes,
  1249  	// so that the byte c can be tested with a shift and an and.
  1250  	// If c > 128, then 1<<c and 1<<(c-64) will both be zero,
  1251  	// and this function will return false.
  1252  	const mask = 0 |
  1253  		(1<<26-1)<<'A' |
  1254  		(1<<26-1)<<'a' |
  1255  		(1<<10-1)<<'0' |
  1256  		1<<'-' |
  1257  		1<<'.' |
  1258  		1<<'~' |
  1259  		1<<'_' |
  1260  		1<<'+'
  1261  
  1262  	return ((uint64(1)<<c)&(mask&(1<<64-1)) |
  1263  		(uint64(1)<<(c-64))&(mask>>64)) != 0
  1264  }
  1265  

View as plain text