...

Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"unicode/utf8"
    11  )
    12  
const (
	// lowerhex maps a 4-bit value (0-15) to its lowercase hexadecimal digit;
	// it is indexed with the high and low nibble of a byte when emitting
	// \x, \u and \U escapes.
	lowerhex = "0123456789abcdef"
	// upperhex is the uppercase counterpart of lowerhex.
	upperhex = "0123456789ABCDEF"
)
    17  
    18  // contains reports whether the string contains the byte c.
    19  func contains(s string, c byte) bool {
    20  	return index(s, c) != -1
    21  }
    22  
    23  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    28  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    29  }
    30  
    31  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    32  	// Often called with big strings, so preallocate. If there's quoting,
    33  	// this is conservative but still helps a lot.
    34  	if cap(buf)-len(buf) < len(s) {
    35  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    36  		copy(nBuf, buf)
    37  		buf = nBuf
    38  	}
    39  	buf = append(buf, quote)
    40  	for width := 0; len(s) > 0; s = s[width:] {
    41  		r := rune(s[0])
    42  		width = 1
    43  		if r >= utf8.RuneSelf {
    44  			r, width = utf8.DecodeRuneInString(s)
    45  		}
    46  		if width == 1 && r == utf8.RuneError {
    47  			buf = append(buf, `\x`...)
    48  			buf = append(buf, lowerhex[s[0]>>4])
    49  			buf = append(buf, lowerhex[s[0]&0xF])
    50  			continue
    51  		}
    52  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    53  	}
    54  	buf = append(buf, quote)
    55  	return buf
    56  }
    57  
    58  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    59  	buf = append(buf, quote)
    60  	if !utf8.ValidRune(r) {
    61  		r = utf8.RuneError
    62  	}
    63  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    64  	buf = append(buf, quote)
    65  	return buf
    66  }
    67  
    68  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    69  	var runeTmp [utf8.UTFMax]byte
    70  	if r == rune(quote) || r == '\\' { // always backslashed
    71  		buf = append(buf, '\\')
    72  		buf = append(buf, byte(r))
    73  		return buf
    74  	}
    75  	if ASCIIonly {
    76  		if r < utf8.RuneSelf && IsPrint(r) {
    77  			buf = append(buf, byte(r))
    78  			return buf
    79  		}
    80  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    81  		n := utf8.EncodeRune(runeTmp[:], r)
    82  		buf = append(buf, runeTmp[:n]...)
    83  		return buf
    84  	}
    85  	switch r {
    86  	case '\a':
    87  		buf = append(buf, `\a`...)
    88  	case '\b':
    89  		buf = append(buf, `\b`...)
    90  	case '\f':
    91  		buf = append(buf, `\f`...)
    92  	case '\n':
    93  		buf = append(buf, `\n`...)
    94  	case '\r':
    95  		buf = append(buf, `\r`...)
    96  	case '\t':
    97  		buf = append(buf, `\t`...)
    98  	case '\v':
    99  		buf = append(buf, `\v`...)
   100  	default:
   101  		switch {
   102  		case r < ' ' || r == 0x7f:
   103  			buf = append(buf, `\x`...)
   104  			buf = append(buf, lowerhex[byte(r)>>4])
   105  			buf = append(buf, lowerhex[byte(r)&0xF])
   106  		case !utf8.ValidRune(r):
   107  			r = 0xFFFD
   108  			fallthrough
   109  		case r < 0x10000:
   110  			buf = append(buf, `\u`...)
   111  			for s := 12; s >= 0; s -= 4 {
   112  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   113  			}
   114  		default:
   115  			buf = append(buf, `\U`...)
   116  			for s := 28; s >= 0; s -= 4 {
   117  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   118  			}
   119  		}
   120  	}
   121  	return buf
   122  }
   123  
   124  // Quote returns a double-quoted Go string literal representing s. The
   125  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   126  // control characters and non-printable characters as defined by
   127  // IsPrint.
   128  func Quote(s string) string {
   129  	return quoteWith(s, '"', false, false)
   130  }
   131  
   132  // AppendQuote appends a double-quoted Go string literal representing s,
   133  // as generated by Quote, to dst and returns the extended buffer.
   134  func AppendQuote(dst []byte, s string) []byte {
   135  	return appendQuotedWith(dst, s, '"', false, false)
   136  }
   137  
   138  // QuoteToASCII returns a double-quoted Go string literal representing s.
   139  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   140  // non-ASCII characters and non-printable characters as defined by IsPrint.
   141  func QuoteToASCII(s string) string {
   142  	return quoteWith(s, '"', true, false)
   143  }
   144  
   145  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   146  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   147  func AppendQuoteToASCII(dst []byte, s string) []byte {
   148  	return appendQuotedWith(dst, s, '"', true, false)
   149  }
   150  
   151  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   152  // The returned string leaves Unicode graphic characters, as defined by
   153  // IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
   154  // for non-graphic characters.
   155  func QuoteToGraphic(s string) string {
   156  	return quoteWith(s, '"', false, true)
   157  }
   158  
   159  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   160  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   161  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   162  	return appendQuotedWith(dst, s, '"', false, true)
   163  }
   164  
   165  // QuoteRune returns a single-quoted Go character literal representing the
   166  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   167  // for control characters and non-printable characters as defined by IsPrint.
   168  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   169  // replacement character U+FFFD.
   170  func QuoteRune(r rune) string {
   171  	return quoteRuneWith(r, '\'', false, false)
   172  }
   173  
   174  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   175  // as generated by QuoteRune, to dst and returns the extended buffer.
   176  func AppendQuoteRune(dst []byte, r rune) []byte {
   177  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   178  }
   179  
   180  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   181  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   182  // \u0100) for non-ASCII characters and non-printable characters as defined
   183  // by IsPrint.
   184  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   185  // replacement character U+FFFD.
   186  func QuoteRuneToASCII(r rune) string {
   187  	return quoteRuneWith(r, '\'', true, false)
   188  }
   189  
   190  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   191  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   192  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   193  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   194  }
   195  
   196  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   197  // the rune. If the rune is not a Unicode graphic character,
   198  // as defined by IsGraphic, the returned string will use a Go escape sequence
   199  // (\t, \n, \xFF, \u0100).
   200  // If r is not a valid Unicode code point, it is interpreted as the Unicode
   201  // replacement character U+FFFD.
   202  func QuoteRuneToGraphic(r rune) string {
   203  	return quoteRuneWith(r, '\'', false, true)
   204  }
   205  
   206  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   207  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   208  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   209  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   210  }
   211  
   212  // CanBackquote reports whether the string s can be represented
   213  // unchanged as a single-line backquoted string without control
   214  // characters other than tab.
   215  func CanBackquote(s string) bool {
   216  	for len(s) > 0 {
   217  		r, wid := utf8.DecodeRuneInString(s)
   218  		s = s[wid:]
   219  		if wid > 1 {
   220  			if r == '\ufeff' {
   221  				return false // BOMs are invisible and should not be quoted.
   222  			}
   223  			continue // All other multibyte runes are correctly encoded and assumed printable.
   224  		}
   225  		if r == utf8.RuneError {
   226  			return false
   227  		}
   228  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   229  			return false
   230  		}
   231  	}
   232  	return true
   233  }
   234  
   235  func unhex(b byte) (v rune, ok bool) {
   236  	c := rune(b)
   237  	switch {
   238  	case '0' <= c && c <= '9':
   239  		return c - '0', true
   240  	case 'a' <= c && c <= 'f':
   241  		return c - 'a' + 10, true
   242  	case 'A' <= c && c <= 'F':
   243  		return c - 'A' + 10, true
   244  	}
   245  	return
   246  }
   247  
   248  // UnquoteChar decodes the first character or byte in the escaped string
   249  // or character literal represented by the string s.
   250  // It returns four values:
   251  //
   252  //  1. value, the decoded Unicode code point or byte value;
   253  //  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   254  //  3. tail, the remainder of the string after the character; and
   255  //  4. an error that will be nil if the character is syntactically valid.
   256  //
   257  // The second argument, quote, specifies the type of literal being parsed
   258  // and therefore which escaped quote character is permitted.
   259  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   260  // If set to a double quote, it permits \" and disallows unescaped ".
   261  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   262  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   263  	// easy cases
   264  	if len(s) == 0 {
   265  		err = ErrSyntax
   266  		return
   267  	}
   268  	switch c := s[0]; {
   269  	case c == quote && (quote == '\'' || quote == '"'):
   270  		err = ErrSyntax
   271  		return
   272  	case c >= utf8.RuneSelf:
   273  		r, size := utf8.DecodeRuneInString(s)
   274  		return r, true, s[size:], nil
   275  	case c != '\\':
   276  		return rune(s[0]), false, s[1:], nil
   277  	}
   278  
   279  	// hard case: c is backslash
   280  	if len(s) <= 1 {
   281  		err = ErrSyntax
   282  		return
   283  	}
   284  	c := s[1]
   285  	s = s[2:]
   286  
   287  	switch c {
   288  	case 'a':
   289  		value = '\a'
   290  	case 'b':
   291  		value = '\b'
   292  	case 'f':
   293  		value = '\f'
   294  	case 'n':
   295  		value = '\n'
   296  	case 'r':
   297  		value = '\r'
   298  	case 't':
   299  		value = '\t'
   300  	case 'v':
   301  		value = '\v'
   302  	case 'x', 'u', 'U':
   303  		n := 0
   304  		switch c {
   305  		case 'x':
   306  			n = 2
   307  		case 'u':
   308  			n = 4
   309  		case 'U':
   310  			n = 8
   311  		}
   312  		var v rune
   313  		if len(s) < n {
   314  			err = ErrSyntax
   315  			return
   316  		}
   317  		for j := 0; j < n; j++ {
   318  			x, ok := unhex(s[j])
   319  			if !ok {
   320  				err = ErrSyntax
   321  				return
   322  			}
   323  			v = v<<4 | x
   324  		}
   325  		s = s[n:]
   326  		if c == 'x' {
   327  			// single-byte string, possibly not UTF-8
   328  			value = v
   329  			break
   330  		}
   331  		if !utf8.ValidRune(v) {
   332  			err = ErrSyntax
   333  			return
   334  		}
   335  		value = v
   336  		multibyte = true
   337  	case '0', '1', '2', '3', '4', '5', '6', '7':
   338  		v := rune(c) - '0'
   339  		if len(s) < 2 {
   340  			err = ErrSyntax
   341  			return
   342  		}
   343  		for j := 0; j < 2; j++ { // one digit already; two more
   344  			x := rune(s[j]) - '0'
   345  			if x < 0 || x > 7 {
   346  				err = ErrSyntax
   347  				return
   348  			}
   349  			v = (v << 3) | x
   350  		}
   351  		s = s[2:]
   352  		if v > 255 {
   353  			err = ErrSyntax
   354  			return
   355  		}
   356  		value = v
   357  	case '\\':
   358  		value = '\\'
   359  	case '\'', '"':
   360  		if c != quote {
   361  			err = ErrSyntax
   362  			return
   363  		}
   364  		value = rune(c)
   365  	default:
   366  		err = ErrSyntax
   367  		return
   368  	}
   369  	tail = s
   370  	return
   371  }
   372  
   373  // QuotedPrefix returns the quoted string (as understood by Unquote) at the prefix of s.
   374  // If s does not start with a valid quoted string, QuotedPrefix returns an error.
   375  func QuotedPrefix(s string) (string, error) {
   376  	out, _, err := unquote(s, false)
   377  	return out, err
   378  }
   379  
   380  // Unquote interprets s as a single-quoted, double-quoted,
   381  // or backquoted Go string literal, returning the string value
   382  // that s quotes.  (If s is single-quoted, it would be a Go
   383  // character literal; Unquote returns the corresponding
   384  // one-character string.)
   385  func Unquote(s string) (string, error) {
   386  	out, rem, err := unquote(s, true)
   387  	if len(rem) > 0 {
   388  		return "", ErrSyntax
   389  	}
   390  	return out, err
   391  }
   392  
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// On failure it returns an empty prefix, the original input as the suffix,
// and ErrSyntax.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	// A quoted string needs at least an opening and a closing quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings have no escapes, so end is exact here.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// At least one '\r' is present, hence the reduced capacity.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mentions that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash before the first matching quote, that quote is
		// the real terminator and the contents can be used without copying.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// A character literal must hold exactly one rune, and that
				// rune must not be the result of decoding an invalid byte.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
			// Otherwise fall through to the general path below.
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // kept for error returns and for slicing the verbatim prefix
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				// ASCII values and results that UnquoteChar reports as not
				// multibyte (such as \x and octal escapes) are stored as a
				// single raw byte; everything else is UTF-8 encoded.
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					var arr [utf8.UTFMax]byte
					n := utf8.EncodeRune(arr[:], r)
					buf = append(buf, arr[:n]...)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim prefix: everything consumed so far, both quotes included.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
   500  
   501  // bsearch16 returns the smallest i such that a[i] >= x.
   502  // If there is no such i, bsearch16 returns len(a).
   503  func bsearch16(a []uint16, x uint16) int {
   504  	i, j := 0, len(a)
   505  	for i < j {
   506  		h := i + (j-i)>>1
   507  		if a[h] < x {
   508  			i = h + 1
   509  		} else {
   510  			j = h
   511  		}
   512  	}
   513  	return i
   514  }
   515  
   516  // bsearch32 returns the smallest i such that a[i] >= x.
   517  // If there is no such i, bsearch32 returns len(a).
   518  func bsearch32(a []uint32, x uint32) int {
   519  	i, j := 0, len(a)
   520  	for i < j {
   521  		h := i + (j-i)>>1
   522  		if a[h] < x {
   523  			i = h + 1
   524  		} else {
   525  			j = h
   526  		}
   527  	}
   528  	return i
   529  }
   530  
   531  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   532  // to give the same answer. It allows this package not to depend on unicode,
   533  // and therefore not pull in all the Unicode tables. If the linker were better
   534  // at tossing unused tables, we could get rid of this implementation.
   535  // That would be nice.
   536  
   537  // IsPrint reports whether the rune is defined as printable by Go, with
   538  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   539  // symbols and ASCII space.
   540  func IsPrint(r rune) bool {
   541  	// Fast check for Latin-1
   542  	if r <= 0xFF {
   543  		if 0x20 <= r && r <= 0x7E {
   544  			// All the ASCII is printable from space through DEL-1.
   545  			return true
   546  		}
   547  		if 0xA1 <= r && r <= 0xFF {
   548  			// Similarly for ¡ through ÿ...
   549  			return r != 0xAD // ...except for the bizarre soft hyphen.
   550  		}
   551  		return false
   552  	}
   553  
   554  	// Same algorithm, either on uint16 or uint32 value.
   555  	// First, find first i such that isPrint[i] >= x.
   556  	// This is the index of either the start or end of a pair that might span x.
   557  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   558  	// If we find x in a range, make sure x is not in isNotPrint list.
   559  
   560  	if 0 <= r && r < 1<<16 {
   561  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   562  		i := bsearch16(isPrint, rr)
   563  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   564  			return false
   565  		}
   566  		j := bsearch16(isNotPrint, rr)
   567  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   568  	}
   569  
   570  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   571  	i := bsearch32(isPrint, rr)
   572  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   573  		return false
   574  	}
   575  	if r >= 0x20000 {
   576  		return true
   577  	}
   578  	r -= 0x10000
   579  	j := bsearch16(isNotPrint, uint16(r))
   580  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   581  }
   582  
   583  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   584  // characters include letters, marks, numbers, punctuation, symbols, and
   585  // spaces, from categories L, M, N, P, S, and Zs.
   586  func IsGraphic(r rune) bool {
   587  	if IsPrint(r) {
   588  		return true
   589  	}
   590  	return isInGraphicList(r)
   591  }
   592  
   593  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   594  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   595  // Should be called only if IsPrint fails.
   596  func isInGraphicList(r rune) bool {
   597  	// We know r must fit in 16 bits - see makeisprint.go.
   598  	if r > 0xFFFF {
   599  		return false
   600  	}
   601  	rr := uint16(r)
   602  	i := bsearch16(isGraphic, rr)
   603  	return i < len(isGraphic) && rr == isGraphic[i]
   604  }
   605  

View as plain text