text_parser.go - Go support for Protocol Buffers

/proto/text_parser.go

https://code.google.com/p/goprotobuf/ · Go · 515 lines · 451 code · 16 blank · 48 comment · 54 complexity · e17c6b611d44164afd1dcfba6c6a7101 MD5 · raw file

// Go support for Protocol Buffers - Google's data interchange format
//
// Copyright 2010 Google Inc.  All rights reserved.
// http://code.google.com/p/goprotobuf/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

package proto

// Functions for parsing the Text protocol buffer format.
// TODO: message sets.

import (
	"fmt"
	"reflect"
	"strconv"
	"strings"
)

// ParseError describes a problem found while parsing text-format input,
// together with the position at which it was detected.
type ParseError struct {
	Message string
	Line    int // 1-based line number
	Offset  int // 0-based byte offset from start of input
}

// Error implements the error interface. The byte offset is reported only
// for errors on the first line; for any other line just the line number
// is shown.
func (p *ParseError) Error() string {
	if p.Line != 1 {
		return fmt.Sprintf("line %d: %v", p.Line, p.Message)
	}
	return fmt.Sprintf("line 1.%d: %v", p.Offset, p.Message)
}

// token is one lexical element of the text format, as scanned by
// textParser, together with the position at which it started.
//
// value holds the raw text of the token; for a quoted string it still
// includes the surrounding double quotes. err, when non-nil, is the parse
// error that was recorded while this token was the parser's current token.
type token struct {
	value    string
	err      *ParseError
	line     int    // line number
	offset   int    // byte number from start of input, not start of line
	unquoted string // the unquoted version of value, if it was a quoted string
}

// String renders the token for debugging: the recorded parse error if
// there is one, otherwise the quoted value with its line and offset.
func (t *token) String() string {
	if t.err != nil {
		return fmt.Sprintf("parse error: %v", t.err)
	}
	return fmt.Sprintf("%q (line=%d, offset=%d)", t.value, t.line, t.offset)
}

// textParser is a tokenizer for text-format input with a single token of
// push-back.
//
// offset counts the input bytes consumed so far and line is the line
// number of the next unread byte (newTextParser starts it at 1). cur is
// the most recently scanned token.
type textParser struct {
	s            string // remaining input
	done         bool   // whether the parsing is finished (success or error)
	backed       bool   // whether back() was called
	offset, line int
	cur          token
}

// newTextParser returns a parser positioned at the start of s. Both the
// parser and its (still empty) current token begin on line 1.
func newTextParser(s string) *textParser {
	p := &textParser{s: s, line: 1}
	p.cur.line = 1
	return p
}

// errorf builds a ParseError from the formatted message, located at the
// line and offset of the current token. The error is also stored on the
// current token and the parser is marked as done.
func (p *textParser) errorf(format string, a ...interface{}) *ParseError {
	p.done = true
	p.cur.err = &ParseError{
		Message: fmt.Sprintf(format, a...),
		Line:    p.cur.line,
		Offset:  p.cur.offset,
	}
	return p.cur.err
}

// isIdentOrNumberChar reports whether c may appear in an identifier or a
// number, i.e. whether it matches [-+._A-Za-z0-9].
func isIdentOrNumberChar(c byte) bool {
	switch {
	case c >= 'a' && c <= 'z':
	case c >= 'A' && c <= 'Z':
	case c >= '0' && c <= '9':
	case c == '-' || c == '+' || c == '.' || c == '_':
	default:
		return false
	}
	return true
}

// isWhitespace reports whether c is a space, tab, newline or carriage
// return.
func isWhitespace(c byte) bool {
	return c == ' ' || c == '\t' || c == '\n' || c == '\r'
}

// skipWhitespace consumes leading whitespace and '#' comments (which run
// to the end of the line) from the remaining input, keeping the line
// counter and byte offset up to date. If nothing is left afterwards the
// parser is marked as done.
func (p *textParser) skipWhitespace() {
	n := 0 // number of bytes to drop from the front of p.s
scan:
	for n < len(p.s) {
		c := p.s[n]
		switch {
		case c == '#':
			// Comment: jump to the newline that ends it, or stop at the
			// end of the input if there is none.
			for n < len(p.s) && p.s[n] != '\n' {
				n++
			}
			if n == len(p.s) {
				break scan
			}
			// p.s[n] is the newline terminating the comment.
			p.line++
			n++
		case c == '\n':
			p.line++
			n++
		case isWhitespace(c):
			n++
		default:
			break scan
		}
	}
	p.offset += n
	p.s = p.s[n:]
	if p.s == "" {
		p.done = true
	}
}

// advance scans the next token from the input into p.cur. It skips
// leading whitespace and comments first and returns without touching
// p.cur if that exhausts the input. On a lexical error the error is
// recorded through errorf and the input is left where the bad token
// starts.
func (p *textParser) advance() {
	p.skipWhitespace()
	if p.done {
		return
	}

	// The token starts at the first non-whitespace byte.
	p.cur.err = nil
	p.cur.unquoted = ""
	p.cur.line = p.line
	p.cur.offset = p.offset

	end := 0 // length in bytes of the token at the front of p.s
	switch p.s[0] {
	case '<', '>', '{', '}', ':', '[', ']':
		// Punctuation is always a one-byte token.
		end = 1
	case '"':
		// Quoted string: look for the closing quote on the same line.
		end = 1
		for end < len(p.s) {
			c := p.s[end]
			if c == '"' || c == '\n' {
				break
			}
			if c == '\\' && end+1 < len(p.s) {
				// Step over the escaped byte so an escaped quote does not
				// terminate the string.
				end++
			}
			end++
		}
		if end >= len(p.s) || p.s[end] != '"' {
			p.errorf("unmatched quote")
			return
		}
		end++ // include the closing quote
		literal := p.s[:end]
		unq, err := unquoteC(literal)
		if err != nil {
			p.errorf("invalid quoted string %v", literal)
			return
		}
		p.cur.unquoted = unq
	default:
		// Identifier or number: the longest run of permitted bytes.
		for end < len(p.s) && isIdentOrNumberChar(p.s[end]) {
			end++
		}
		if end == 0 {
			p.errorf("unexpected byte %#x", p.s[0])
			return
		}
	}
	p.cur.value, p.s = p.s[:end], p.s[end:]
	p.offset += end
}

// unquoteC interprets s, a quoted text-format string literal (including
// its quotes), and returns the string it denotes, or an error if the
// literal is malformed.
//
// A notable divergence between quoted string literals in Go and what is
// acceptable for text format protocol buffers: the former considers \'
// invalid inside double quotes, but the latter considers it valid. Each \'
// escape is therefore rewritten to a bare ' before the literal is handed
// to strconv.Unquote.
//
// The rewrite walks the literal one escape sequence at a time instead of
// doing a blind substring replacement, so an escaped backslash that is
// followed by a quote (\\') is left intact rather than being turned into
// an invalid \' sequence.
func unquoteC(s string) (string, error) {
	if !strings.Contains(s, `\'`) {
		// Fast path: there is nothing to rewrite.
		return strconv.Unquote(s)
	}
	buf := make([]byte, 0, len(s))
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c != '\\' || i+1 == len(s) {
			buf = append(buf, c)
			continue
		}
		// c starts an escape sequence; consume the escaped byte along with
		// it so that byte is never mistaken for the start of another escape.
		i++
		if s[i] == '\'' {
			buf = append(buf, '\'')
		} else {
			buf = append(buf, '\\', s[i])
		}
	}
	return strconv.Unquote(string(buf))
}

// back pushes the current token back, so that the following call to
// next() returns it again instead of scanning a new token. It can only be
// used between calls to next().
func (p *textParser) back() {
	p.backed = true
}

// next advances the parser and returns the new current token. If back()
// was called, or the parser is already done, the current token is returned
// again without scanning. At end of input the returned token has an empty
// value. Adjacent quoted strings separated only by whitespace are merged
// into a single token.
func (p *textParser) next() *token {
	if p.backed || p.done {
		p.backed = false
		return &p.cur
	}

	p.advance()
	if p.done {
		p.cur.value = ""
		return &p.cur
	}
	if len(p.cur.value) == 0 || p.cur.value[0] != '"' {
		return &p.cur
	}

	// The token is a quoted string: keep absorbing quoted strings that
	// follow it, concatenating them onto a copy of the first one.
	joined := p.cur
	for p.skipWhitespace(); !p.done && p.s[0] == '"'; p.skipWhitespace() {
		p.advance()
		if p.cur.err != nil {
			return &p.cur
		}
		joined.value += " " + p.cur.value
		joined.unquoted += p.cur.unquoted
	}
	// The look-ahead may have hit EOF, but the joined token still has to
	// be delivered; the following call will notice the EOF again.
	p.done = false
	p.cur = joined
	return &p.cur
}

// missingRequiredFieldError returns an error naming the first field of
// the struct sv that is required but still nil.
func (p *textParser) missingRequiredFieldError(sv reflect.Value) *ParseError {
	st := sv.Type()
	sprops := GetProperties(st)
	for i, n := 0, st.NumField(); i < n; i++ {
		if isNil(sv.Field(i)) && sprops.Prop[i].Required {
			return p.errorf("message %v missing required field %q", st, sprops.Prop[i].OrigName)
		}
	}
	// Should not happen: the caller counted a missing required field, yet
	// none was found above.
	return p.errorf("message %v missing required field", st)
}

// structFieldByName looks up the field of st whose original name is name.
// It returns the field's index in the struct, its parsed tag properties,
// and true; or -1, nil and false if there is no such field.
func structFieldByName(st reflect.Type, name string) (int, *Properties, bool) {
	sprops := GetProperties(st)
	if idx, found := sprops.origNames[name]; found {
		return idx, sprops.Prop[idx], true
	}
	return -1, nil, false
}

// checkForColon consumes a ':' from the input stream if the next token is
// one. If the next token is something else it is pushed back, unless the
// field described by props and typ needs a colon, in which case an error
// is returned.
func (p *textParser) checkForColon(props *Properties, typ reflect.Type) *ParseError {
	tok := p.next()
	if tok.err != nil {
		return tok.err
	}
	if tok.value == ":" {
		return nil
	}

	// No colon. That is only acceptable when the field is a group or a
	// message.
	colonRequired := true
	switch props.Wire {
	case "group":
		colonRequired = false
	case "bytes":
		// A "bytes" field is either a message, a string, or a repeated
		// field; those become *T, *string and []T respectively. Only a
		// pointer to a string, or a slice whose elements are not pointers,
		// still needs the colon.
		switch typ.Kind() {
		case reflect.Ptr:
			// *T or *string
			colonRequired = typ.Elem().Kind() == reflect.String
		case reflect.Slice:
			// []T or []*T
			colonRequired = typ.Elem().Kind() != reflect.Ptr
		default:
			colonRequired = false
		}
	}
	if colonRequired {
		return p.errorf("expected ':', found %q", tok.value)
	}
	p.back()
	return nil
}

// readStruct parses a sequence of "name: value" entries into the struct
// sv until it reads the terminator token. The callers in this file pass
// ">" or "}" for a nested message and "" for top-level input, where the
// empty token returned at end of input acts as the terminator. A name may
// also be "[extension]". An error is returned if a required field has not
// been set by the time the terminator is reached.
func (p *textParser) readStruct(sv reflect.Value, terminator string) *ParseError {
	st := sv.Type()
	missing := GetProperties(st).reqCount // required fields not yet parsed

	for {
		tok := p.next()
		if tok.err != nil {
			return tok.err
		}
		if tok.value == terminator {
			break
		}

		if tok.value != "[" {
			// This is a normal, non-extension field.
			fi, props, ok := structFieldByName(st, tok.value)
			if !ok {
				return p.errorf("unknown field name %q in %v", tok.value, st)
			}
			field := sv.Field(fi)

			// A field that is not repeated must not already be set.
			if !props.Repeated && !isNil(field) {
				return p.errorf("non-repeated field %q was repeated", tok.value)
			}

			if err := p.checkForColon(props, st.Field(fi).Type); err != nil {
				return err
			}

			// Parse into the field.
			if err := p.readAny(field, props); err != nil {
				return err
			}

			if props.Required {
				missing--
			}
			continue
		}

		// Looks like an extension.
		//
		// TODO: Check whether we need to handle
		// namespace rooted names (e.g. ".something.Foo").
		tok = p.next()
		if tok.err != nil {
			return tok.err
		}
		// This could be faster, but it's functional.
		// TODO: Do something smarter than a linear scan.
		var desc *ExtensionDesc
		for _, d := range RegisteredExtensions(reflect.New(st).Interface()) {
			if d.Name == tok.value {
				desc = d
				break
			}
		}
		if desc == nil {
			return p.errorf("unrecognized extension %q", tok.value)
		}

		// Check the extension terminator.
		tok = p.next()
		if tok.err != nil {
			return tok.err
		}
		if tok.value != "]" {
			return p.errorf("unrecognized extension terminator %q", tok.value)
		}

		props := new(Properties)
		props.Parse(desc.Tag)

		typ := reflect.TypeOf(desc.ExtensionType)
		if err := p.checkForColon(props, typ); err != nil {
			return err
		}

		// Read the extension value, and set it in the message we're
		// constructing.
		ext := reflect.New(typ).Elem()
		if err := p.readAny(ext, props); err != nil {
			return err
		}
		// NOTE(review): any result returned by SetExtension is discarded
		// here - confirm that is intended.
		SetExtension(sv.Addr().Interface().(extendableProto),
			desc, ext.Interface())
	}

	if missing > 0 {
		return p.missingRequiredFieldError(sv)
	}
	return nil
}

// readAny parses one value from the input into v, choosing the syntax
// from v's kind. Pointers are allocated and slices grown by one element,
// and the value is then parsed into the new element by a recursive call;
// structs are delegated to readStruct. A token that does not fit the
// kind yields an "invalid <type>" error.
func (p *textParser) readAny(v reflect.Value, props *Properties) *ParseError {
	tok := p.next()
	if tok.err != nil {
		return tok.err
	}
	if tok.value == "" {
		return p.errorf("unexpected EOF")
	}
	text := tok.value

	switch v.Kind() {
	case reflect.Ptr:
		// A basic field (indirected through pointer), or a repeated
		// message/group. Allocate the target and re-read the same token.
		p.back()
		v.Set(reflect.New(v.Type().Elem()))
		return p.readAny(v.Elem(), props)

	case reflect.Struct:
		switch text {
		case "{":
			return p.readStruct(v, "}")
		case "<":
			return p.readStruct(v, ">")
		}
		return p.errorf("expected '{' or '<', found %q", text)

	case reflect.Slice:
		sliceType := v.Type()
		if sliceType.Elem().Kind() == reflect.Uint8 {
			// Special case for []byte, which is written as a string.
			if text[0] != '"' {
				// Deliberately written out here, as the generic error at
				// the end of this function would say "invalid []byte: ...",
				// which is not as user-friendly.
				return p.errorf("invalid string: %v", text)
			}
			v.Set(reflect.ValueOf([]byte(tok.unquoted)))
			return nil
		}

		// Repeated field, which may already hold elements: make room for
		// one more, growing the backing array if it is full.
		n := v.Len()
		if n == v.Cap() {
			grown := reflect.MakeSlice(sliceType, n, 2*n+1)
			reflect.Copy(grown, v)
			v.Set(grown)
		}
		v.SetLen(n + 1)

		// Re-read the same token into the new last element.
		p.back()
		return p.readAny(v.Index(n), props)

	case reflect.String:
		if text[0] == '"' {
			v.SetString(tok.unquoted)
			return nil
		}

	case reflect.Bool:
		// Either "true", "false", 1 or 0.
		if text == "true" || text == "1" {
			v.SetBool(true)
			return nil
		}
		if text == "false" || text == "0" {
			v.SetBool(false)
			return nil
		}

	case reflect.Float32, reflect.Float64:
		f, err := strconv.ParseFloat(text, v.Type().Bits())
		if err == nil {
			v.SetFloat(f)
			return nil
		}

	case reflect.Int32:
		if n, err := strconv.ParseInt(text, 10, 32); err == nil {
			v.SetInt(n)
			return nil
		}
		// Not a decimal literal; for an enum field, try the value's name.
		if len(props.Enum) > 0 {
			if names, known := enumValueMaps[props.Enum]; known {
				if n, found := names[text]; found {
					v.SetInt(int64(n))
					return nil
				}
			}
		}

	case reflect.Int64:
		n, err := strconv.ParseInt(text, 10, 64)
		if err == nil {
			v.SetInt(n)
			return nil
		}

	case reflect.Uint32:
		n, err := strconv.ParseUint(text, 10, 32)
		if err == nil {
			v.SetUint(uint64(n))
			return nil
		}

	case reflect.Uint64:
		n, err := strconv.ParseUint(text, 10, 64)
		if err == nil {
			v.SetUint(n)
			return nil
		}
	}
	return p.errorf("invalid %v: %v", v.Type(), text)
}

// notPtrStruct is the error returned by UnmarshalText when the destination
// is not a pointer to a struct.
var notPtrStruct error = &ParseError{Message: "destination is not a pointer to a struct"}

// UnmarshalText reads a protobuffer in Text format from s into pb, which
// must be a pointer to a struct; otherwise notPtrStruct is returned.
// Parse failures are returned as a *ParseError.
func UnmarshalText(s string, pb interface{}) error {
	dst := reflect.ValueOf(pb)
	isStructPtr := dst.Kind() == reflect.Ptr && dst.Elem().Kind() == reflect.Struct
	if !isStructPtr {
		return notPtrStruct
	}

	pe := newTextParser(s).readStruct(dst.Elem(), "")
	if pe == nil {
		// Return an untyped nil: a nil *ParseError stored in the error
		// interface would compare unequal to nil.
		return nil
	}
	return pe
}
Tech Fingerprint

Alerts (4)

'interface{}' Empty interface; prefer specific types or generics for type safety
89
Complexity hotspot; lines 121 to 122 (total complexity: 6)
121 122
'for {' Infinite loop detected; ensure it has a proper exit condition (e.g., break, return) to avoid unintentional resource consumption or hangs
218