1 package parser
2
3 import (
4 "bytes"
5 "regexp"
6 "strings"
7
8 "github.com/yuin/goldmark/ast"
9 "github.com/yuin/goldmark/text"
10 "github.com/yuin/goldmark/util"
11 )
12
// allowedBlockTags is the set of HTML tag names that may start a type 6 HTML
// block. Keys are lower case; callers lower-case the tag name before the
// lookup and only test for key presence, so every value is simply true.
var allowedBlockTags = map[string]bool{
	"address":    true,
	"article":    true,
	"aside":      true,
	"base":       true,
	"basefont":   true,
	"blockquote": true,
	"body":       true,
	"caption":    true,
	"center":     true,
	"col":        true,
	"colgroup":   true,
	"dd":         true,
	"details":    true,
	"dialog":     true,
	"dir":        true,
	"div":        true,
	"dl":         true,
	"dt":         true,
	"fieldset":   true,
	"figcaption": true,
	"figure":     true,
	"footer":     true,
	"form":       true,
	"frame":      true,
	"frameset":   true,
	"h1":         true,
	"h2":         true,
	"h3":         true,
	"h4":         true,
	"h5":         true,
	"h6":         true,
	"head":       true,
	"header":     true,
	"hr":         true,
	"html":       true,
	"iframe":     true,
	"legend":     true,
	"li":         true,
	"link":       true,
	"main":       true,
	"menu":       true,
	"menuitem":   true,
	"meta":       true,
	"nav":        true,
	"noframes":   true,
	"ol":         true,
	"optgroup":   true,
	"option":     true,
	"p":          true,
	"param":      true,
	"section":    true,
	"source":     true,
	"summary":    true,
	"table":      true,
	"tbody":      true,
	"td":         true,
	"tfoot":      true,
	"th":         true,
	"thead":      true,
	"title":      true,
	"tr":         true,
	"track":      true,
	"ul":         true,
}
78
// htmlBlockType1OpenRegexp matches a line that starts a type 1 HTML block:
// up to three spaces of indentation followed by an opening script, pre,
// style or textarea tag (case-insensitive). The tag name must be followed by
// whitespace, ">", "/>" or the end of the line.
var htmlBlockType1OpenRegexp = regexp.MustCompile(`(?i)^[ ]{0,3}<(script|pre|style|textarea)(?:\s.*|>.*|/>.*|)(?:\r\n|\n)?$`)

// htmlBlockType1CloseRegexp matches a line that contains a closing script,
// pre, style or textarea tag (case-insensitive), which ends a type 1 block.
var htmlBlockType1CloseRegexp = regexp.MustCompile(`(?i)^.*</(?:script|pre|style|textarea)>.*`)

// htmlBlockType2OpenRegexp matches a line that starts an HTML comment.
var htmlBlockType2OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<!\-\-`)

// htmlBlockType2Close is the byte sequence that ends a type 2 block.
var htmlBlockType2Close = []byte{'-', '-', '>'}

// htmlBlockType3OpenRegexp matches a line that starts a processing
// instruction.
var htmlBlockType3OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\?`)

// htmlBlockType3Close is the byte sequence that ends a type 3 block.
var htmlBlockType3Close = []byte{'?', '>'}

// htmlBlockType4OpenRegexp matches a line that starts a declaration: "<!"
// followed by an ASCII letter. Both cases are accepted, as CommonMark 0.30
// and later require, so "<!doctype html>" opens a block just like
// "<!DOCTYPE html>".
var htmlBlockType4OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<![A-Za-z]+.*(?:\r\n|\n)?$`)

// htmlBlockType4Close is the byte sequence that ends a type 4 block.
var htmlBlockType4Close = []byte{'>'}

// htmlBlockType5OpenRegexp matches a line that starts a CDATA section.
var htmlBlockType5OpenRegexp = regexp.MustCompile(`^[ ]{0,3}<\!\[CDATA\[`)

// htmlBlockType5Close is the byte sequence that ends a type 5 block.
var htmlBlockType5Close = []byte{']', ']', '>'}

// htmlBlockType6Regexp matches a line that begins with an open or closing
// tag whose name is followed by a space, a tab, ">", "/>" or the end of the
// line. The tag does not have to be complete on this line. Capture group 1
// is the tag name; the caller decides whether that name is a block-level tag.
var htmlBlockType6Regexp = regexp.MustCompile(`^[ ]{0,3}<(?:/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(?:[ \t].*|>.*|/>.*|)(?:\r\n|\n)?$`)
95
96 var htmlBlockType7Regexp = regexp.MustCompile(`^[ ]{0,3}<(/[ ]*)?([a-zA-Z]+[a-zA-Z0-9\-]*)(` + attributePattern + `*)[ ]*(?:>|/>)[ ]*(?:\r\n|\n)?$`)
97
// htmlBlockParser is the block parser for raw HTML blocks. It has no fields,
// so it carries no per-instance state.
type htmlBlockParser struct{}
100
101 var defaultHTMLBlockParser = &htmlBlockParser{}
102
103
104
105 func NewHTMLBlockParser() BlockParser {
106 return defaultHTMLBlockParser
107 }
108
109 func (b *htmlBlockParser) Trigger() []byte {
110 return []byte{'<'}
111 }
112
113 func (b *htmlBlockParser) Open(parent ast.Node, reader text.Reader, pc Context) (ast.Node, State) {
114 var node *ast.HTMLBlock
115 line, segment := reader.PeekLine()
116 last := pc.LastOpenedBlock().Node
117 if pos := pc.BlockOffset(); pos < 0 || line[pos] != '<' {
118 return nil, NoChildren
119 }
120
121 if m := htmlBlockType1OpenRegexp.FindSubmatchIndex(line); m != nil {
122 node = ast.NewHTMLBlock(ast.HTMLBlockType1)
123 } else if htmlBlockType2OpenRegexp.Match(line) {
124 node = ast.NewHTMLBlock(ast.HTMLBlockType2)
125 } else if htmlBlockType3OpenRegexp.Match(line) {
126 node = ast.NewHTMLBlock(ast.HTMLBlockType3)
127 } else if htmlBlockType4OpenRegexp.Match(line) {
128 node = ast.NewHTMLBlock(ast.HTMLBlockType4)
129 } else if htmlBlockType5OpenRegexp.Match(line) {
130 node = ast.NewHTMLBlock(ast.HTMLBlockType5)
131 } else if match := htmlBlockType7Regexp.FindSubmatchIndex(line); match != nil {
132 isCloseTag := match[2] > -1 && bytes.Equal(line[match[2]:match[3]], []byte("/"))
133 hasAttr := match[6] != match[7]
134 tagName := strings.ToLower(string(line[match[4]:match[5]]))
135 _, ok := allowedBlockTags[tagName]
136 if ok {
137 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
138 } else if tagName != "script" && tagName != "style" && tagName != "pre" && !ast.IsParagraph(last) && !(isCloseTag && hasAttr) {
139 node = ast.NewHTMLBlock(ast.HTMLBlockType7)
140 }
141 }
142 if node == nil {
143 if match := htmlBlockType6Regexp.FindSubmatchIndex(line); match != nil {
144 tagName := string(line[match[2]:match[3]])
145 _, ok := allowedBlockTags[strings.ToLower(tagName)]
146 if ok {
147 node = ast.NewHTMLBlock(ast.HTMLBlockType6)
148 }
149 }
150 }
151 if node != nil {
152 reader.Advance(segment.Len() - 1)
153 node.Lines().Append(segment)
154 return node, NoChildren
155 }
156 return nil, NoChildren
157 }
158
159 func (b *htmlBlockParser) Continue(node ast.Node, reader text.Reader, pc Context) State {
160 htmlBlock := node.(*ast.HTMLBlock)
161 lines := htmlBlock.Lines()
162 line, segment := reader.PeekLine()
163 var closurePattern []byte
164
165 switch htmlBlock.HTMLBlockType {
166 case ast.HTMLBlockType1:
167 if lines.Len() == 1 {
168 firstLine := lines.At(0)
169 if htmlBlockType1CloseRegexp.Match(firstLine.Value(reader.Source())) {
170 return Close
171 }
172 }
173 if htmlBlockType1CloseRegexp.Match(line) {
174 htmlBlock.ClosureLine = segment
175 reader.Advance(segment.Len() - 1)
176 return Close
177 }
178 case ast.HTMLBlockType2:
179 closurePattern = htmlBlockType2Close
180 fallthrough
181 case ast.HTMLBlockType3:
182 if closurePattern == nil {
183 closurePattern = htmlBlockType3Close
184 }
185 fallthrough
186 case ast.HTMLBlockType4:
187 if closurePattern == nil {
188 closurePattern = htmlBlockType4Close
189 }
190 fallthrough
191 case ast.HTMLBlockType5:
192 if closurePattern == nil {
193 closurePattern = htmlBlockType5Close
194 }
195
196 if lines.Len() == 1 {
197 firstLine := lines.At(0)
198 if bytes.Contains(firstLine.Value(reader.Source()), closurePattern) {
199 return Close
200 }
201 }
202 if bytes.Contains(line, closurePattern) {
203 htmlBlock.ClosureLine = segment
204 reader.Advance(segment.Len())
205 return Close
206 }
207
208 case ast.HTMLBlockType6, ast.HTMLBlockType7:
209 if util.IsBlank(line) {
210 return Close
211 }
212 }
213 node.Lines().Append(segment)
214 reader.Advance(segment.Len() - 1)
215 return Continue | NoChildren
216 }
217
218 func (b *htmlBlockParser) Close(node ast.Node, reader text.Reader, pc Context) {
219
220 }
221
222 func (b *htmlBlockParser) CanInterruptParagraph() bool {
223 return true
224 }
225
226 func (b *htmlBlockParser) CanAcceptIndentedLine() bool {
227 return false
228 }
229
View as plain text