// hclwrite/parser.go - hashicorp/hcl/v2 - Git at Google

 package hclwrite

 import (
 	"fmt"
 	"sort"

 	"github.com/hashicorp/hcl/v2"
 	"github.com/hashicorp/hcl/v2/hclsyntax"
 	"github.com/zclconf/go-cty/cty"
 )

 // parse turns the given source buffer into a *File whose node tree carries
 // every token of the input.
 //
 // Despite its name, this function performs no parsing of its own. It runs
 // the native hclsyntax parser and then uses the source ranges recorded in
 // the resulting AST to carve up the raw token sequence, so that each AST
 // node can be matched with the raw tokens that produced it.
 //
 // Discarding most of the parser's work like this may look wasteful, but it
 // is a deliberate trade-off: ordinary parsing via hclsyntax is the "main
 // case", while modifying and re-printing source is an edge case used only by
 // ancillary tools. Keeping the token wrangling here keeps that extra
 // complexity out of the main parser, which is already complex enough.
 //
 // If parsing reports any errors the returned File is nil, because tokens
 // cannot be reliably extracted from the partial AST of a failed parse.
 func parse(src []byte, filename string, start hcl.Pos) (*File, hcl.Diagnostics) {
 	nativeFile, diags := hclsyntax.ParseConfig(src, filename, start)
 	if diags.HasErrors() {
 		return nil, diags
 	}

 	// The "native" tokens (from hclsyntax) carry the absolute positions we
 	// need to match against AST source ranges. What we actually hand out are
 	// slices of the parallel "writer" tokens, which hold only *relative*
 	// position information and so suit transformation/writing use-cases.
 	native, diags := hclsyntax.LexConfig(src, filename, start)
 	if diags.HasErrors() {
 		// Not expected to happen: the parse above should already have
 		// reported any scanning problems.
 		return nil, diags
 	}

 	all := inputTokens{
 		nativeTokens: native,
 		writerTokens: writerTokens(native),
 	}

 	leading, rootNode, trailing := parseBody(nativeFile.Body.(*hclsyntax.Body), all)

 	f := &File{
 		inTree: newInTree(),

 		srcBytes: src,
 		body:     rootNode,
 	}

 	// The file's children are: tokens before the body, the body itself, and
 	// whatever tokens follow it.
 	top := f.inTree.children
 	top.Append(leading.Tokens())
 	top.AppendNode(rootNode)
 	top.Append(trailing.Tokens())

 	return f, diags
 }

 // inputTokens pairs a sequence of native hclsyntax tokens with the
 // corresponding sequence of writer tokens. The methods on this type slice
 // both sequences with the same indices, so the two are expected to have the
 // same length and to correlate by index.
 type inputTokens struct {
 	// nativeTokens are the scanner's tokens; the partitioning helpers use
 	// their source ranges to locate boundaries.
 	nativeTokens hclsyntax.Tokens
 	// writerTokens are this package's tokens, returned to callers by Tokens.
 	writerTokens Tokens
 }

 // Partition splits the receiver into three consecutive sub-sequences
 // (before, within and after the given range) at the boundary indices
 // reported by partitionTokens for the native tokens.
 func (it inputTokens) Partition(rng hcl.Range) (before, within, after inputTokens) {
 	lo, hi := partitionTokens(it.nativeTokens, rng)
 	return it.Slice(0, lo), it.Slice(lo, hi), it.Slice(hi, len(it.nativeTokens))
 }

 // PartitionType splits the receiver around the first token whose type is ty:
 // everything before it, the single matching token, and everything after it.
 //
 // It panics if the receiver contains no token of that type.
 func (it inputTokens) PartitionType(ty hclsyntax.TokenType) (before, within, after inputTokens) {
 	for idx := range it.writerTokens {
 		if it.writerTokens[idx].Type != ty {
 			continue
 		}
 		before = it.Slice(0, idx)
 		within = it.Slice(idx, idx+1)
 		after = it.Slice(idx+1, len(it.nativeTokens))
 		return before, within, after
 	}
 	panic(fmt.Sprintf("didn't find any token of type %s", ty))
 }

 // PartitionTypeOk is the non-panicking variant of PartitionType. When a
 // token of type ty is present it splits around the first one and reports
 // ok=true; otherwise it returns three zero-value inputTokens and ok=false.
 func (it inputTokens) PartitionTypeOk(ty hclsyntax.TokenType) (before, within, after inputTokens, ok bool) {
 	pos := -1
 	for i, tok := range it.writerTokens {
 		if tok.Type == ty {
 			pos = i
 			break
 		}
 	}
 	if pos < 0 {
 		return inputTokens{}, inputTokens{}, inputTokens{}, false
 	}

 	return it.Slice(0, pos), it.Slice(pos, pos+1), it.Slice(pos+1, len(it.nativeTokens)), true
 }

 // PartitionTypeSingle behaves like PartitionType but returns the matching
 // token itself (as a writer token) rather than a one-element sequence.
 //
 // It panics under the same condition as PartitionType, and also if the
 // matched sequence does not hold exactly one token.
 func (it inputTokens) PartitionTypeSingle(ty hclsyntax.TokenType) (before inputTokens, found *Token, after inputTokens) {
 	pre, match, post := it.PartitionType(ty)
 	if match.Len() != 1 {
 		panic("PartitionType found more than one token")
 	}
 	found = match.Tokens()[0]
 	return pre, found, post
 }

 // PartitionIncludingComments is like Partition except that the returned
 // "within" sequence is widened to take in any lead comments directly before
 // the range and any line comment and newline directly after it.
 func (it inputTokens) PartitionIncludingComments(rng hcl.Range) (before, within, after inputTokens) {
 	first, last := partitionTokens(it.nativeTokens, rng)

 	// Move the start back over the run of comment tokens that immediately
 	// precedes the range.
 	first = partitionLeadCommentTokens(it.nativeTokens[:first])

 	// Move the end forward past the line-end tokens. The helper's result is
 	// relative to the sub-slice it was given, hence the addition.
 	_, lineEnd := partitionLineEndTokens(it.nativeTokens[last:])
 	last += lineEnd

 	return it.Slice(0, first), it.Slice(first, last), it.Slice(last, len(it.nativeTokens))
 }

 // PartitionBlockItem is similar to PartitionIncludingComments, but instead of
 // folding the comments into "within" it returns the lead comments, the line
 // comments and the trailing newline as separate token sequences so that they
 // can be captured into AST attributes. It makes assumptions that hold only
 // for block items, so it should not be used for other constructs.
 func (it inputTokens) PartitionBlockItem(rng hcl.Range) (before, leadComments, within, lineComments, newline, after inputTokens) {
 	pre, item, post := it.Partition(rng)

 	before, leadComments = pre.PartitionLeadComments()
 	within = item
 	lineComments, newline, after = post.PartitionLineEndTokens()

 	return before, leadComments, within, lineComments, newline, after
 }

 // PartitionLeadComments splits the receiver at the index reported by
 // partitionLeadCommentTokens: "within" is the trailing run of lead comment
 // tokens and "before" is everything that precedes it.
 func (it inputTokens) PartitionLeadComments() (before, within inputTokens) {
 	split := partitionLeadCommentTokens(it.nativeTokens)
 	return it.Slice(0, split), it.Slice(split, len(it.nativeTokens))
 }

 // PartitionLineEndTokens splits the receiver at the two indices reported by
 // partitionLineEndTokens, yielding the leading line comments, the newline
 // that ends the line, and all remaining tokens.
 func (it inputTokens) PartitionLineEndTokens() (comments, newline, after inputTokens) {
 	commentEnd, lineEnd := partitionLineEndTokens(it.nativeTokens)
 	return it.Slice(0, commentEnd), it.Slice(commentEnd, lineEnd), it.Slice(lineEnd, len(it.nativeTokens))
 }

 // Slice returns the sub-sequence [start, end) of both token slices.
 func (it inputTokens) Slice(start, end int) inputTokens {
 	// The three-index form caps each result's capacity at its length. These
 	// slices are expected to be mutated later when new code is inserted into
 	// the AST, and with no spare capacity any append must allocate a fresh
 	// backing array instead of overwriting, and thereby corrupting, the
 	// tokens of whatever slice follows.
 	native := it.nativeTokens[start:end:end]
 	writer := it.writerTokens[start:end:end]

 	return inputTokens{
 		nativeTokens: native,
 		writerTokens: writer,
 	}
 }

 // Len reports how many tokens the receiver holds, counted on the native
 // token slice.
 func (it inputTokens) Len() int {
 	count := len(it.nativeTokens)
 	return count
 }

 // Tokens returns the writer-token half of the receiver.
 func (it inputTokens) Tokens() Tokens {
 	writer := it.writerTokens
 	return writer
 }

 // Types returns a newly allocated slice holding the type of each native
 // token, in order.
 func (it inputTokens) Types() []hclsyntax.TokenType {
 	types := make([]hclsyntax.TokenType, 0, len(it.nativeTokens))
 	for idx := range it.nativeTokens {
 		types = append(types, it.nativeTokens[idx].Type)
 	}
 	return types
 }

 // parseBody finds the given native body inside the given input tokens and
 // returns, in order: the tokens that precede it, a node wrapping the new
 // *Body, and the tokens that follow it.
 func parseBody(nativeBody *hclsyntax.Body, from inputTokens) (inputTokens, *node, inputTokens) {
 	before, within, after := from.PartitionIncludingComments(nativeBody.SrcRange)

 	// The native AST keeps attributes and blocks in separate collections and
 	// so loses their original interleaving. Gather them into one list and
 	// sort it to recover source order.
 	itemCount := len(nativeBody.Attributes) + len(nativeBody.Blocks)
 	ordered := make([]hclsyntax.Node, 0, itemCount)
 	for _, attr := range nativeBody.Attributes {
 		ordered = append(ordered, attr)
 	}
 	for _, blk := range nativeBody.Blocks {
 		ordered = append(ordered, blk)
 	}
 	sort.Sort(nativeNodeSorter{ordered})

 	body := &Body{
 		inTree: newInTree(),
 		items:  newNodeSet(),
 	}

 	// Walk the items in order, each time consuming tokens from the front of
 	// what is still pending. Tokens found ahead of an item are kept as
 	// unstructured tokens.
 	pending := within
 	for _, nativeItem := range ordered {
 		gap, item, rest := parseBodyItem(nativeItem, pending)

 		if gap.Len() > 0 {
 			body.AppendUnstructuredTokens(gap.Tokens())
 		}
 		body.appendItemNode(item)

 		pending = rest
 	}

 	// Whatever is left after the final item is kept as unstructured tokens.
 	if pending.Len() > 0 {
 		body.AppendUnstructuredTokens(pending.Tokens())
 	}

 	return before, newNode(body), after
 }

 // parseBodyItem carves the given native item (an attribute or a block) out
 // of the given tokens, together with its lead comments, line comments and
 // trailing newline, and builds the matching node. It returns the tokens
 // before the item's lead comments, the item's node, and the tokens after its
 // line end.
 //
 // It panics if nativeItem is neither *hclsyntax.Attribute nor
 // *hclsyntax.Block.
 func parseBodyItem(nativeItem hclsyntax.Node, from inputTokens) (inputTokens, *node, inputTokens) {
 	before, leadComments, within, lineComments, newline, after := from.PartitionBlockItem(nativeItem.Range())

 	switch typed := nativeItem.(type) {
 	case *hclsyntax.Attribute:
 		return before, parseAttribute(typed, within, leadComments, lineComments, newline), after
 	case *hclsyntax.Block:
 		return before, parseBlock(typed, within, leadComments, lineComments, newline), after
 	}

 	// should never happen if caller is behaving
 	panic("unsupported native item type")
 }

 // parseAttribute builds the Attribute node for the given native attribute.
 //
 // "from" holds the attribute's own tokens; leadComments, lineComments and
 // newline are the surrounding sequences already separated by the caller.
 // Children are appended in source order: lead comments, name, equals sign,
 // expression, line comments, newline, then any leftover tokens.
 //
 // It panics if the attribute's name range does not cover exactly one token.
 func parseAttribute(nativeAttr *hclsyntax.Attribute, from, leadComments, lineComments, newline inputTokens) *node {
 	attr := &Attribute{
 		inTree: newInTree(),
 	}
 	kids := attr.inTree.children

 	// Lead comments get a comments node even when there are none.
 	leadNode := newNode(newComments(leadComments.Tokens()))
 	attr.leadComments = leadNode
 	kids.AppendNode(leadNode)

 	// Attribute name.
 	gap, nameToks, rest := from.Partition(nativeAttr.NameRange)
 	kids.AppendUnstructuredTokens(gap.Tokens())
 	if nameToks.Len() != 1 {
 		// Should never happen with valid input
 		panic("attribute name is not exactly one token")
 	}
 	nameNode := newNode(newIdentifier(nameToks.Tokens()[0]))
 	attr.name = nameNode
 	kids.AppendNode(nameNode)

 	// The equals sign is kept as plain unstructured tokens.
 	gap, eqToks, rest := rest.Partition(nativeAttr.EqualsRange)
 	kids.AppendUnstructuredTokens(gap.Tokens())
 	kids.AppendUnstructuredTokens(eqToks.Tokens())

 	// Value expression.
 	gap, exprToks, rest := rest.Partition(nativeAttr.Expr.Range())
 	kids.AppendUnstructuredTokens(gap.Tokens())
 	exprNode := parseExpression(nativeAttr.Expr, exprToks)
 	attr.expr = exprNode
 	kids.AppendNode(exprNode)

 	// Line comments, again as an always-present comments node.
 	lineNode := newNode(newComments(lineComments.Tokens()))
 	attr.lineComments = lineNode
 	kids.AppendNode(lineNode)

 	kids.AppendUnstructuredTokens(newline.Tokens())

 	// Collect any stragglers, though there shouldn't be any
 	kids.AppendUnstructuredTokens(rest.Tokens())

 	return newNode(attr)
 }

 // parseBlock builds the Block node for the given native block.
 //
 // "from" holds the block's own tokens; leadComments, lineComments and
 // newline are the surrounding sequences already separated by the caller.
 // Every token handed in is attached to the block's children in this order:
 // lead comments, type name, labels, open brace, body, close brace, any
 // leftover tokens, line comments (as unstructured tokens) and the newline.
 //
 // It panics if the block's type range does not cover exactly one token.
 func parseBlock(nativeBlock *hclsyntax.Block, from, leadComments, lineComments, newline inputTokens) *node {
 	block := &Block{
 		inTree: newInTree(),
 	}
 	children := block.inTree.children

 	{
 		cn := newNode(newComments(leadComments.Tokens()))
 		block.leadComments = cn
 		children.AppendNode(cn)
 	}

 	before, typeTokens, from := from.Partition(nativeBlock.TypeRange)
 	{
 		children.AppendUnstructuredTokens(before.Tokens())
 		if typeTokens.Len() != 1 {
 			// Should never happen with valid input
 			panic("block type name is not exactly one token")
 		}
 		token := typeTokens.Tokens()[0]
 		in := newNode(newIdentifier(token))
 		block.typeName = in
 		children.AppendNode(in)
 	}

 	before, labelsNode, from := parseBlockLabels(nativeBlock, from)
 	{
 		// parseBlockLabels returns the tokens that precede the first label
 		// instead of attaching them to the labels node, so they must be
 		// attached here; otherwise they would be dropped when "before" is
 		// reassigned below.
 		if before.Len() > 0 {
 			children.AppendUnstructuredTokens(before.Tokens())
 		}
 		block.labels = labelsNode
 		children.AppendNode(labelsNode)
 	}

 	before, oBrace, from := from.Partition(nativeBlock.OpenBraceRange)
 	children.AppendUnstructuredTokens(before.Tokens())
 	block.open = children.AppendUnstructuredTokens(oBrace.Tokens())

 	// We go a bit out of order here: we go hunting for the closing brace
 	// so that we have a delimited body, but then we'll deal with the body
 	// before we actually append the closing brace and any straggling tokens
 	// that appear after it.
 	bodyTokens, cBrace, from := from.Partition(nativeBlock.CloseBraceRange)
 	before, body, after := parseBody(nativeBlock.Body, bodyTokens)
 	children.AppendUnstructuredTokens(before.Tokens())
 	block.body = body
 	children.AppendNode(body)
 	children.AppendUnstructuredTokens(after.Tokens())

 	block.close = children.AppendUnstructuredTokens(cBrace.Tokens())

 	// stragglers
 	children.AppendUnstructuredTokens(from.Tokens())
 	if lineComments.Len() > 0 {
 		// blocks don't actually have line comments, so we'll just treat
 		// them as extra stragglers
 		children.AppendUnstructuredTokens(lineComments.Tokens())
 	}
 	children.AppendUnstructuredTokens(newline.Tokens())

 	return newNode(block)
 }

 // parseBlockLabels builds the labels node for the given native block by
 // consuming one label per entry in its LabelRanges from the front of "from".
 //
 // It returns the tokens that precede the first label (left for the caller
 // to place), the labels node, and the tokens remaining after the last label.
 // Tokens between later labels are attached to the labels node itself. When
 // the block has no labels, the first result is the zero value and the last
 // is "from" unchanged.
 func parseBlockLabels(nativeBlock *hclsyntax.Block, from inputTokens) (inputTokens, *node, inputTokens) {
 	labels := newBlockLabels(nil)
 	kids := labels.children

 	var leading inputTokens
 	remaining := from
 	for idx, labelRange := range nativeBlock.LabelRanges {
 		gap, labelToks, rest := remaining.Partition(labelRange)
 		remaining = rest

 		switch idx {
 		case 0:
 			leading = gap
 		default:
 			kids.AppendUnstructuredTokens(gap.Tokens())
 		}

 		// A label made of a single identifier token becomes an identifier
 		// node; anything else is handled as a quoted label.
 		toks := labelToks.Tokens()
 		var labelNode *node
 		if len(toks) != 1 || toks[0].Type != hclsyntax.TokenIdent {
 			labelNode = newNode(newQuoted(toks))
 		} else {
 			labelNode = newNode(newIdentifier(toks[0]))
 		}
 		labels.items.Add(labelNode)
 		kids.AppendNode(labelNode)
 	}

 	return leading, newNode(labels), remaining
 }

 // parseExpression builds the expression node for the given native
 // expression from its tokens. Each traversal reported by the expression's
 // Variables method becomes a child traversal node and is recorded in
 // absTraversals; all other tokens are attached as unstructured tokens.
 func parseExpression(nativeExpr hclsyntax.Expression, from inputTokens) *node {
 	expr := newExpression()
 	kids := expr.inTree.children

 	remaining := from
 	for _, nativeTraversal := range nativeExpr.Variables() {
 		gap, traversalNode, rest := parseTraversal(nativeTraversal, remaining)
 		kids.AppendUnstructuredTokens(gap.Tokens())
 		kids.AppendNode(traversalNode)
 		expr.absTraversals.Add(traversalNode)
 		remaining = rest
 	}

 	// Anything not claimed by a traversal belongs to the expression itself.
 	// For an expression without traversals that is every token in "from".
 	kids.AppendUnstructuredTokens(remaining.Tokens())

 	return newNode(expr)
 }

 // parseTraversal locates the given native traversal within "from" and builds
 // a traversal node with one child step node per traverser. It returns the
 // tokens before the traversal's source range, the traversal node, and the
 // tokens after the range.
 //
 // Tokens inside the range that remain after the last step has been parsed
 // are not attached to the node.
 func parseTraversal(nativeTraversal hcl.Traversal, from inputTokens) (before inputTokens, n *node, after inputTokens) {
 	traversal := newTraversal()
 	kids := traversal.inTree.children

 	var inner inputTokens
 	before, inner, after = from.Partition(nativeTraversal.SourceRange())

 	for _, nativeStep := range nativeTraversal {
 		gap, stepNode, rest := parseTraversalStep(nativeStep, inner)
 		kids.AppendUnstructuredTokens(gap.Tokens())
 		kids.AppendNode(stepNode)
 		traversal.steps.Add(stepNode)
 		inner = rest
 	}

 	n = newNode(traversal)
 	return before, n, after
 }

 // parseTraversalStep locates a single native traversal step within "from"
 // and builds the matching step node. It returns the tokens before the step's
 // source range, the step node, and the tokens after the range.
 //
 // Root and attribute steps produce a name step built around the first
 // identifier token in the range. Index steps produce an index step whose key
 // is taken either from a dot followed by a number literal or from the tokens
 // between an opening and a closing bracket.
 //
 // It panics for any other step type, and (through the PartitionType helpers)
 // when a token the step is expected to contain is missing.
 func parseTraversalStep(nativeStep hcl.Traverser, from inputTokens) (before inputTokens, n *node, after inputTokens) {
 	switch typedStep := nativeStep.(type) {

 	case hcl.TraverseRoot, hcl.TraverseAttr:
 		step := newTraverseName()
 		kids := step.inTree.children

 		var inner inputTokens
 		before, inner, after = from.Partition(nativeStep.SourceRange())

 		lead, identToken, trail := inner.PartitionTypeSingle(hclsyntax.TokenIdent)
 		ident := newIdentifier(identToken)
 		kids.AppendUnstructuredTokens(lead.Tokens())
 		step.name = kids.Append(ident)
 		kids.AppendUnstructuredTokens(trail.Tokens())

 		return before, newNode(step), after

 	case hcl.TraverseIndex:
 		step := newTraverseIndex()
 		kids := step.inTree.children

 		var inner inputTokens
 		before, inner, after = from.Partition(nativeStep.SourceRange())

 		// Dot form: a dot token followed by a number literal key.
 		if lead, dot, rest, ok := inner.PartitionTypeOk(hclsyntax.TokenDot); ok {
 			kids.AppendUnstructuredTokens(lead.Tokens())
 			kids.AppendUnstructuredTokens(dot.Tokens())

 			numLead, numToken, numTrail := rest.PartitionTypeSingle(hclsyntax.TokenNumberLit)
 			kids.AppendUnstructuredTokens(numLead.Tokens())
 			numKey := newNumber(numToken)
 			step.key = kids.Append(numKey)
 			kids.AppendUnstructuredTokens(numTrail.Tokens())

 			return before, newNode(step), after
 		}

 		// Bracket form: the key is whatever lies between the brackets.
 		lead, openBrack, rest := inner.PartitionType(hclsyntax.TokenOBrack)
 		kids.AppendUnstructuredTokens(lead.Tokens())
 		kids.AppendUnstructuredTokens(openBrack.Tokens())
 		keyToks, closeBrack, tail := rest.PartitionType(hclsyntax.TokenCBrack)

 		// Keys of any type other than string or number leave step.key unset
 		// and contribute no key tokens.
 		switch typedStep.Key.Type() {
 		case cty.String:
 			strKey := newQuoted(keyToks.Tokens())
 			step.key = kids.Append(strKey)
 		case cty.Number:
 			numLead, numToken, numTrail := keyToks.PartitionTypeSingle(hclsyntax.TokenNumberLit)
 			kids.AppendUnstructuredTokens(numLead.Tokens())
 			numKey := newNumber(numToken)
 			step.key = kids.Append(numKey)
 			kids.AppendUnstructuredTokens(numTrail.Tokens())
 		}

 		kids.AppendUnstructuredTokens(closeBrack.Tokens())
 		kids.AppendUnstructuredTokens(tail.Tokens())

 		return before, newNode(step), after

 	default:
 		panic(fmt.Sprintf("unsupported traversal step type %T", nativeStep))
 	}

 }

 // writerTokens converts a sequence of tokens produced by the main hclsyntax
 // package into an equivalent sequence in this package's own token model.
 //
 // The result has exactly as many tokens as the input and uses the same
 // indices, so the two sequences can be correlated by index.
 func writerTokens(nativeTokens hclsyntax.Tokens) Tokens {
 	count := len(nativeTokens)

 	// Callers need a slice of token _pointers_, but since the total is known
 	// up front all tokens are allocated as one flat buffer, which gives the
 	// GC less work than allocating each token separately.
 	flat := make([]Token, count)
 	out := make(Tokens, count)

 	prevEnd := 0
 	for idx := range nativeTokens {
 		native := nativeTokens[idx]

 		// Copy the bytes so that the result can be mutated without
 		// corrupting the original token stream.
 		raw := make([]byte, len(native.Bytes))
 		copy(raw, native.Bytes)

 		flat[idx] = Token{
 			Type:  native.Type,
 			Bytes: raw,

 			// Spaces are assumed to always be ASCII spaces, as the scanner
 			// also assumes, so the number of bytes skipped since the
 			// previous token equals the number of space characters.
 			SpacesBefore: native.Range.Start.Byte - prevEnd,
 		}
 		out[idx] = &flat[idx]

 		prevEnd = native.Range.End.Byte
 	}

 	return out
 }

 // partitionTokens takes a sequence of tokens and a hcl.Range and returns the
 // two indices in that sequence that correspond to the range boundaries, so
 // that slicing yields the tokens before, within and after the range:
 //
 //     start, end := partitionTokens(toks, rng)
 //     before := toks[:start]
 //     within := toks[start:end]
 //     after := toks[end:]
 //
 // start is the index of the first token that begins at or after the start of
 // the range, and end is the index of the first token from there on that
 // begins at or after the end of the range. If no token qualifies as start,
 // both results are len(toks); if none qualifies as end, end is len(toks).
 //
 // Results are best when the range is aligned with token boundaries (e.g.
 // because it was derived from the scanner's output). Otherwise this is a
 // best effort that may give strange results at the boundaries.
 //
 // Native hclsyntax tokens are used because they carry the absolute position
 // information needed here. Since writerTokens produces a sequence that
 // correlates by index, the returned indices can equally be used to slice
 // writer tokens.
 //
 // The tokens are assumed to be in source order and non-overlapping, which
 // holds when the scanner's token sequence is used directly.
 func partitionTokens(toks hclsyntax.Tokens, rng hcl.Range) (start, end int) {
 	n := len(toks)

 	// Linear scans are used because the target range is assumed to usually
 	// lie near the beginning of the sequence, and sequences are generally
 	// small for most reasonable files anyway.
 	for start < n && toks[start].Range.Start.Byte < rng.Start.Byte {
 		start++
 	}
 	if start == n {
 		// No tokens for the given range at all!
 		return n, n
 	}

 	// The end marker is exclusive. If the scan runs off the end, the range
 	// "hangs off" the token sequence and end is n.
 	end = start
 	for end < n && toks[end].Range.Start.Byte < rng.End.Byte {
 		end++
 	}

 	return start, end
 }

 // partitionLeadCommentTokens takes a sequence of tokens assumed to come
 // immediately before a construct that can have lead comments, and returns
 // the index in that sequence at which the lead comments begin.
 //
 // Lead comments are whole lines containing only comment tokens with no blank
 // lines between them. If there are none, the result is len(toks).
 func partitionLeadCommentTokens(toks hclsyntax.Tokens) int {
 	// Single-line comments (the kind of interest here) consume their own
 	// trailing newline, so it is enough to step backwards from the end for
 	// as long as the preceding token is a comment.
 	idx := len(toks)
 	for idx > 0 && toks[idx-1].Type == hclsyntax.TokenComment {
 		idx--
 	}
 	return idx
 }

 // partitionLineEndTokens takes a sequence of tokens assumed to come
 // immediately after a construct that can have a line comment. It returns
 // first the index where any line comments end, and second the index just
 // past the trailing newline.
 //
 // Line comments are comments that appear right after a construct, on the
 // same line where its significant tokens ended.
 //
 // Single-line comment tokens (# and //) include the newline that terminates
 // them, so when one is present both results are the same: the comment itself
 // serves as the line end. An EOF token ends the scan without being counted
 // as a newline. If the sequence is exhausted first, both results are
 // len(toks).
 //
 // It panics on any token that is not a comment, newline or EOF.
 func partitionLineEndTokens(toks hclsyntax.Tokens) (afterComment, afterNewline int) {
 	for idx, tok := range toks {
 		switch tok.Type {
 		case hclsyntax.TokenComment:
 			if size := len(tok.Bytes); size > 0 && tok.Bytes[size-1] == '\n' {
 				// The newline ending a single-line comment marks both the
 				// end of the comments *and* the end of the line.
 				return idx + 1, idx + 1
 			}
 			// Any other comment does not end the line; keep scanning.

 		case hclsyntax.TokenNewline:
 			return idx, idx + 1

 		case hclsyntax.TokenEOF:
 			// Although this is valid, the EOF itself must not be included
 			// as the "newline", or strange things happen when new items
 			// are appended later.
 			return idx, idx

 		default:
 			// Well-formed input cannot reach this point: tokens are only
 			// extracted after the parser has succeeded, and the parser
 			// should catch this problem itself.
 			panic("malformed line trailers: expected only comments and newlines")
 		}
 	}

 	return len(toks), len(toks)
 }

 // lexConfig runs the hclsyntax scanner over src, starting at byte 0, line 1,
 // column 1 with an empty filename, and converts the resulting token stream
 // into this package's token model.
 //
 // Diagnostics produced during scanning are deliberately discarded, so the
 // result should be used with care.
 func lexConfig(src []byte) Tokens {
 	startPos := hcl.Pos{Byte: 0, Line: 1, Column: 1}
 	native, _ := hclsyntax.LexConfig(src, "", startPos)
 	return writerTokens(native)
 }