cmd/go: ignore UTF8 BOM when reading source code

Fix the problem that UTF8 BOM can cause the parsing of import path and directives to fail. Fixes #46198 Fixes #46290 Fixes #35726 Change-Id: I2d9995ee82b094bcfa5583f0cb4e8547cb973077 GitHub-Last-Rev: 98abf91377 GitHub-Pull-Request: golang/go#46643 Reviewed-on: https://go-review.googlesource.com/c/go/+/325990 Reviewed-by: Bryan C. Mills <bcmills@google.com> Trust: Bryan C. Mills <bcmills@google.com> Trust: Robert Findley <rfindley@google.com> Run-TryBot: Bryan C. Mills <bcmills@google.com> TryBot-Result: Go Bot <gobot@golang.org>
2024-09-15 22:20:06 +00:00 · 2021-06-15 17:11:05 +00:00 · 2021-06-15 17:11:05 +00:00 · 219fe9d547
parent 723f199edd
commit 219fe9d547
5 changed files with 117 additions and 3 deletions
--- a/src/cmd/go/internal/imports/read.go
+++ b/src/cmd/go/internal/imports/read.go
@ -8,6 +8,7 @@ package imports

 import (
 	"bufio"
+	"bytes"
 	"errors"
 	"io"
 	"unicode/utf8"
@ -22,6 +23,19 @@ type importReader struct {
 	nerr int
 }

+var bom = []byte{0xef, 0xbb, 0xbf}
+
+func newImportReader(b *bufio.Reader) *importReader {
+	// Remove leading UTF-8 BOM.
+	// Per https://golang.org/ref/spec#Source_code_representation:
+	// a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF)
+	// if it is the first Unicode code point in the source text.
+	if leadingBytes, err := b.Peek(3); err == nil && bytes.Equal(leadingBytes, bom) {
+		b.Discard(3)
+	}
+	return &importReader{b: b}
+}
+
 func isIdent(c byte) bool {
 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z' || '0' <= c && c <= '9' || c == '_' || c >= utf8.RuneSelf
 }
@ -201,7 +215,7 @@ func (r *importReader) readImport(imports *[]string) {
 // ReadComments is like io.ReadAll, except that it only reads the leading
 // block of comments in the file.
 func ReadComments(f io.Reader) ([]byte, error) {
-	r := &importReader{b: bufio.NewReader(f)}
+	r := newImportReader(bufio.NewReader(f))
 	r.peekByte(true)
 	if r.err == nil && !r.eof {
 		// Didn't reach EOF, so must have found a non-space byte. Remove it.
@ -213,7 +227,7 @@ func ReadComments(f io.Reader) ([]byte, error) {
 // ReadImports is like io.ReadAll, except that it expects a Go file as input
 // and stops reading the input once the imports have completed.
 func ReadImports(f io.Reader, reportSyntaxError bool, imports *[]string) ([]byte, error) {
-	r := &importReader{b: bufio.NewReader(f)}
+	r := newImportReader(bufio.NewReader(f))

 	r.readKeyword("package")
 	r.readIdent()
--- a/src/cmd/go/internal/imports/read_test.go
+++ b/src/cmd/go/internal/imports/read_test.go
@ -66,6 +66,10 @@ var readImportsTests = []readTest{
 		`,
 		"",
 	},
+	{
+		"\ufeff𝔻" + `package p; import "x";ℙvar x = 1`,
+		"",
+	},
 }

 var readCommentsTests = []readTest{
@ -81,6 +85,10 @@ var readCommentsTests = []readTest{
 		`ℙpackage p; import . "x"`,
 		"",
 	},
+	{
+		"\ufeff𝔻" + `ℙpackage p; import . "x"`,
+		"",
+	},
 	{
 		`// foo

@ -90,6 +98,19 @@ var readCommentsTests = []readTest{
 		
 		/*/ zot */

+		// asdf
+		ℙHello, world`,
+		"",
+	},
+	{
+		"\ufeff𝔻" + `// foo
+
+		/* bar */
+
+		/* quux */ // baz
+
+		/*/ zot */
+
 		// asdf
 		ℙHello, world`,
 		"",
@ -107,6 +128,11 @@ func testRead(t *testing.T, tests []readTest, read func(io.Reader) ([]byte, erro
 			in = tt.in[:j] + tt.in[j+len("ℙ"):]
 			testOut = tt.in[:j]
 		}
+		d := strings.Index(tt.in, "𝔻")
+		if d >= 0 {
+			in = in[:d] + in[d+len("𝔻"):]
+			testOut = testOut[d+len("𝔻"):]
+		}
 		r := strings.NewReader(in)
 		buf, err := read(r)
 		if err != nil {
--- a/src/cmd/go/testdata/script/build_ignore_leading_bom.txt
+++ b/src/cmd/go/testdata/script/build_ignore_leading_bom.txt
@ -0,0 +1,27 @@
+# Per https://golang.org/ref/spec#Source_code_representation:
+# a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF)
+# if it is the first Unicode code point in the source text.
+
+go list -f 'Imports: {{.Imports}} EmbedFiles: {{.EmbedFiles}}' .
+stdout '^Imports: \[embed m/hello\] EmbedFiles: \[.*file\]$'
+
+-- go.mod --
+module m
+
+go 1.16
+-- m.go --
+package main
+
+import (
+	_ "embed"
+
+	"m/hello"
+)
+
+//go:embed file
+var s string
+
+-- hello/hello.go --
+package hello
+
+-- file --
--- a/src/go/build/read.go
+++ b/src/go/build/read.go
@ -6,6 +6,7 @@ package build

 import (
 	"bufio"
+	"bytes"
 	"errors"
 	"fmt"
 	"go/ast"
@ -28,9 +29,19 @@ type importReader struct {
 	pos  token.Position
 }

+var bom = []byte{0xef, 0xbb, 0xbf}
+
 func newImportReader(name string, r io.Reader) *importReader {
+	b := bufio.NewReader(r)
+	// Remove leading UTF-8 BOM.
+	// Per https://golang.org/ref/spec#Source_code_representation:
+	// a compiler may ignore a UTF-8-encoded byte order mark (U+FEFF)
+	// if it is the first Unicode code point in the source text.
+	if leadingBytes, err := b.Peek(3); err == nil && bytes.Equal(leadingBytes, bom) {
+		b.Discard(3)
+	}
 	return &importReader{
-		b: bufio.NewReader(r),
+		b: b,
 		pos: token.Position{
 			Filename: name,
 			Line:     1,
--- a/src/go/build/read_test.go
+++ b/src/go/build/read_test.go
@ -66,6 +66,10 @@ var readGoInfoTests = []readTest{
 		`,
 		"",
 	},
+	{
+		"\ufeff𝔻" + `package p; import "x";ℙvar x = 1`,
+		"",
+	},
 }

 var readCommentsTests = []readTest{
@ -81,6 +85,10 @@ var readCommentsTests = []readTest{
 		`ℙpackage p; import . "x"`,
 		"",
 	},
+	{
+		"\ufeff𝔻" + `ℙpackage p; import . "x"`,
+		"",
+	},
 	{
 		`// foo

@ -90,6 +98,19 @@ var readCommentsTests = []readTest{

 		/*/ zot */

+		// asdf
+		ℙHello, world`,
+		"",
+	},
+	{
+		"\ufeff𝔻" + `// foo
+
+		/* bar */
+
+		/* quux */ // baz
+
+		/*/ zot */
+
 		// asdf
 		ℙHello, world`,
 		"",
@ -107,6 +128,11 @@ func testRead(t *testing.T, tests []readTest, read func(io.Reader) ([]byte, erro
 			in = tt.in[:j] + tt.in[j+len("ℙ"):]
 			testOut = tt.in[:j]
 		}
+		d := strings.Index(tt.in, "𝔻")
+		if d >= 0 {
+			in = in[:d] + in[d+len("𝔻"):]
+			testOut = testOut[d+len("𝔻"):]
+		}
 		r := strings.NewReader(in)
 		buf, err := read(r)
 		if err != nil {
@ -264,6 +290,12 @@ var readEmbedTests = []struct {
 		 test:3:14:y
 		 test:3:16:z`,
 	},
+	{
+		"\ufeffpackage p\nimport \"embed\"\n//go:embed x y z\nvar files embed.FS",
+		`test:3:12:x
+		 test:3:14:y
+		 test:3:16:z`,
+	},
 	{
 		"package p\nimport \"embed\"\nvar s = \"/*\"\n//go:embed x\nvar files embed.FS",
 		`test:4:12:x`,
@ -292,6 +324,10 @@ var readEmbedTests = []struct {
 		"package p\n//go:embed x y z\nvar files embed.FS", // no import, no scan
 		"",
 	},
+	{
+		"\ufeffpackage p\n//go:embed x y z\nvar files embed.FS", // no import, no scan
+		"",
+	},
 }

 func TestReadEmbed(t *testing.T) {