tb_parser.cpp | searchcode

/Source/ThirdParty/TurboBadger/parser/tb_parser.cpp

https://gitlab.com/Teo-Mirror/AtomicGameEngine · C++ · 496 lines · 384 code · 62 blank · 50 comment · 152 complexity · 42cd3e113e429811d560bfde38582e04 MD5 · raw file

// ================================================================================
// ==      This file is a part of Turbo Badger. (C) 2011-2014, Emil Segerås      ==
// ==                     See tb_core.h for more information.                    ==
// ================================================================================

#include "parser/tb_parser.h"
#include "tb_tempbuffer.h"
#include "utf8/utf8.h"
#include <assert.h>
#include <ctype.h>

namespace tb {

// == Util functions ====================================================================

/** Return true if c is an ASCII hexadecimal digit (0-9, a-f or A-F). */
static bool is_hex(char c)
{
    if (c >= '0' && c <= '9')
        return true;
    if (c >= 'a' && c <= 'f')
        return true;
    return c >= 'A' && c <= 'F';
}

/** Parse at most max_count hexadecimal digits starting at src.
    Parsing stops early at the first character that is not a hex digit.
    On return src has been advanced past every digit that was consumed.
    Returns the accumulated value (0 if no digit was consumed). */
static uint32 parse_hex(char *&src, int max_count)
{
    uint32 result = 0;
    int remaining = max_count;
    while (remaining-- > 0 && is_hex(*src))
    {
        const char ch = *src++;
        result <<= 4;
        result |= isdigit(ch) ? ch - '0' : tolower(ch) - 'a' + 10;
    }
    return result;
}

/** Replace backslash escape sequences in str with the characters they
    represent. The string is modified in place; the result is never longer
    than the input.

    Recognized escapes: \a \b \f \n \r \t \v \0 \" \' \\ as well as
    \xXX (up to 2 hex digits) and \uXXXX (up to 4 hex digits), where the hex
    forms are written out as UTF-8. An unrecognized escape is copied through
    unchanged (backslash included).

    Note that \0 writes a null byte into the string, so the result is cut
    short at that point for any consumer treating it as a C string. */
void UnescapeString(char *str)
{
    // Fast forward to the first escape sequence. Everything before it is
    // already where it should be.
    while (*str && *str != '\\')
        str++;

    char *dst = str, *src = str;
    while (*src)
    {
        if (*src == '\\')
        {
            bool code_found = true;
            switch (src[1])
            {
            case 'a': *dst = '\a'; break;
            case 'b': *dst = '\b'; break;
            case 'f': *dst = '\f'; break;
            case 'n': *dst = '\n'; break;
            case 'r': *dst = '\r'; break;
            case 't': *dst = '\t'; break;
            case 'v': *dst = '\v'; break;
            case '0': *dst = '\0'; break;
            case '\"': *dst = '\"'; break;
            case '\'': *dst = '\''; break;
            case '\\': *dst = '\\'; break;
            case 'x': // \xXX
            case 'u': // \uXXXX
            {
                // Pick the digit count from the escape letter BEFORE stepping
                // past it. Once src has been advanced, src[1] is a hex digit
                // and can never equal 'x', which would make \x read 4 digits.
                const int max_digits = (src[1] == 'x') ? 2 : 4;

                // This should be safe. A utf-8 character can be at most 4 bytes,
                // and we have 4 bytes to use for \xXX and 6 for \uXXXX.
                src += 2;
                // A value of 0 (including "no digits at all") emits nothing.
                if (UCS4 hex = parse_hex(src, max_digits))
                    dst += utf8::encode(hex, dst);
                continue;
            }
            default:
                code_found = false;
            }
            if (code_found)
            {
                // Two source characters collapsed into the one written above.
                src += 2;
                dst++;
                continue;
            }
        }
        // Plain character (or unknown escape): copy it through as is.
        *dst = *src;
        dst++;
        src++;
    }
    *dst = 0;
}

/** Return true if the first character of str is a space or a tab. */
bool is_white_space(const char *str)
{
    const char first = *str;
    return first == ' ' || first == '\t';
}

/** Check whether the given string begins with a color: a '#' immediately
    followed by a run of exactly 3, 4, 6 or 8 hexadecimal digits.
    Ex: #ffdd00, #fd0 */
bool is_start_of_color(const char *str)
{
    if (str[0] != '#')
        return false;

    // Count the hex digits directly after the '#'.
    int hex_run = 0;
    for (const char *p = str + 1; is_hex(*p); p++)
        hex_run++;

    switch (hex_run)
    {
    case 3:
    case 4:
    case 6:
    case 8:
        return true;
    default:
        return false;
    }
}

/** Check whether the given string may be a node reference (such as a
    language string or a TBNodeRefTree reference): it must begin with '@'
    and the first token (up to a space or the end of the string) must not
    contain a colon. */
bool is_start_of_reference(const char *str)
{
    if (str[0] != '@')
        return false;

    for (const char *p = str + 1; *p != 0 && *p != ' '; p++)
    {
        // A colon inside the token means this is a key, not a value.
        if (*p == ':')
            return false;
    }
    return true;
}

/** Return true if line holds nothing but whitespace, or whitespace followed
    by a '#' comment. In that case line is advanced past the leading
    whitespace; otherwise line is left untouched. */
bool is_space_or_comment(char *&line)
{
    char *cursor = line;
    while (is_white_space(cursor))
        cursor++;

    const bool blank_or_comment = (*cursor == 0 || *cursor == '#');
    if (blank_or_comment)
        line = cursor;
    return blank_or_comment;
}

/** Return true if str, after any leading whitespace, consists of a single
    backslash and nothing else. */
bool is_pending_multiline(const char *str)
{
    const char *p = str;
    while (is_white_space(p))
        p++;
    if (p[0] != '\\')
        return false;
    return p[1] == 0;
}

/** Return true if the character at buf is an unescaped quote_type character.
    The quote counts as escaped when it is directly preceded by an odd number
    of backslashes. buf_start marks how far back the backslash scan may go. */
bool IsEndQuote(const char *buf_start, const char *buf, const char quote_type)
{
    if (*buf != quote_type)
        return false;

    // Count the run of backslashes directly in front of the quote.
    int backslash_run = 0;
    for (const char *p = buf; p > buf_start && p[-1] == '\\'; --p)
        ++backslash_run;

    // An even run means the backslashes escape each other, not the quote.
    return (backslash_run % 2) == 0;
}

// == Parser ============================================================================

/** Pull data from stream chunk by chunk, split it into lines at '\n' and
    hand every line to OnLine() as a null terminated string.

    A UTF-8 byte order mark at the very start of the stream is skipped, and a
    trailing '\r' is removed from each line. A line that spans two chunks is
    assembled in a temporary buffer before being handled.

    @param stream  Source of the data. Reading stops when GetMoreData returns 0.
    @param target  Receiver passed on to OnLine().
    @return STATUS_OUT_OF_MEMORY if a buffer could not be grown, else STATUS_OK. */
TBParser::STATUS TBParser::Read(TBParserStream *stream, TBParserTarget *target)
{
    TBTempBuffer line, work;
    if (!line.Reserve(1024) || !work.Reserve(1024))
        return STATUS_OUT_OF_MEMORY;

    current_indent = 0;
    current_line_nr = 1;
    pending_multiline = false;
    multi_line_sub_level = 0;

    bool first_chunk = true;
    while (int read_len = stream->GetMoreData((char *)work.GetData(), work.GetCapacity()))
    {
        char *buf = work.GetData();

        // Skip BOM (BYTE ORDER MARK) character, often in the beginning of UTF-8 documents.
        // Only the first chunk can hold it. Checking the line number instead would
        // also match later chunks of an overlong first line. The BOM is exactly
        // 3 bytes, so a chunk of 3 bytes is enough to contain it.
        if (first_chunk)
        {
            first_chunk = false;
            if (read_len >= 3 &&
                    (uint8)buf[0] == 239 &&
                    (uint8)buf[1] == 187 &&
                    (uint8)buf[2] == 191)
            {
                read_len -= 3;
                buf += 3;
            }
        }

        int line_pos = 0;
        while (true)
        {
            // Find line end
            int line_start = line_pos;
            while (line_pos < read_len && buf[line_pos] != '\n')
                line_pos++;

            if (line_pos < read_len)
            {
                // We have a line. Append it to whatever was left over from the
                // previous chunk (if anything).
                int line_len = line_pos - line_start;
                if (!line.Append(buf + line_start, line_len))
                    return STATUS_OUT_OF_MEMORY;

                // Strip away trailing '\r' if the line has it
                char *linebuf = line.GetData();
                int linebuf_len = line.GetAppendPos();
                if (linebuf_len > 0 && linebuf[linebuf_len - 1] == '\r')
                    linebuf[linebuf_len - 1] = 0;

                // Terminate the line string
                if (!line.Append("", 1))
                    return STATUS_OUT_OF_MEMORY;

                // Handle line
                OnLine(line.GetData(), target);
                current_line_nr++;

                line.ResetAppendPos();
                line_pos++; // Skip this \n
                // Find next line
                continue;
            }
            // No more lines here so push the rest and break for more data
            if (!line.Append(buf + line_start, read_len - line_start))
                return STATUS_OUT_OF_MEMORY;
            break;
        }
    }
    // Handle a final line that was not terminated by '\n'.
    if (line.GetAppendPos())
    {
        // Strip away trailing '\r' here too, so the last line is treated
        // the same way as every other line.
        char *linebuf = line.GetData();
        int linebuf_len = line.GetAppendPos();
        if (linebuf[linebuf_len - 1] == '\r')
            linebuf[linebuf_len - 1] = 0;

        if (!line.Append("", 1))
            return STATUS_OUT_OF_MEMORY;
        OnLine(line.GetData(), target);
        current_line_nr++;
    }
    return STATUS_OK;
}

/** Handle one null terminated line of input.

    Blank lines are ignored and comment lines are reported through
    target->OnComment. While a multi line value is pending, the line is
    forwarded to OnMultiline. Otherwise the leading indentation is measured,
    target->Enter/Leave are called to follow the change in indent level, and
    the first token (plus its value, if any) is reported through
    target->OnToken. A token ending with ':' starts a compact line whose
    remainder is handled by OnCompactLine.

    The line buffer is modified in place (tokens are null terminated and
    values unescaped). */
void TBParser::OnLine(char *line, TBParserTarget *target)
{
    // Blank or comment line. is_space_or_comment has moved line past the
    // leading whitespace, so line points at '#' or at the terminator.
    if (is_space_or_comment(line))
    {
        if (*line == '#')
            target->OnComment(current_line_nr, line + 1);
        return;
    }
    // A previous value is still waiting for its continuation.
    if (pending_multiline)
    {
        OnMultiline(line, target);
        return;
    }

    // Count the tabs and spaces that make up the leading whitespace.
    int i = 0;
    int tabs = 0;
    int spaces = 0;

    while (line[i] != 0)
    {
        if (line[i] == '\t')
            tabs++;
        else if (line[i] == ' ')
            spaces++;
        else
            break;
        i++;
    }

    // The first space indented line decides how many spaces make one level.
    // NOTE(review): -1 is presumably the "not yet known" initial value set
    // outside this function - confirm.
    if (spaces && indent_spaces == -1)
    {
        indent_spaces = spaces;
    }

    // Spaces are rejected when this line also has leading tabs, or when an
    // earlier line was already indented with tabs.
    if ((tabs || indent_tabs) && spaces)
    {
        target->OnError(current_line_nr, "Indentation error. Mixed tabs and spaces (Line skipped)");
        return;
    }


    // Check indent
    int indent = 0;

    if (tabs)
    {
        // Tab indentation: one tab is one level.
        indent_tabs = true;
        indent += tabs;
        line += tabs;
    }
    else
    {
        // Space indentation: every full group of indent_spaces spaces is one
        // level. All leading spaces are consumed, including a partial group.
        i = 0;
        int c = 0;
        while (line[i] == ' ' && line[i] != 0)
        {
            c++;
            i++;

            if (indent_spaces == c)
            {
                c = 0;
                indent++;
            }

        }

        line += i;
    }

    // A line may be at most one level deeper than the current level.
    if (indent - current_indent > 1)
    {
        target->OnError(current_line_nr, "Indentation error. (Line skipped)");
        return;
    }

    if (indent > current_indent)
    {
        // Jumps of more than one level were rejected above, so this always holds.
        assert(indent - current_indent == 1);
        target->Enter();
        current_indent++;
    }
    else if (indent < current_indent)
    {
        // Leave once for every level we dropped.
        while (indent < current_indent)
        {
            target->Leave();
            current_indent--;
        }
    }

    if (*line == 0)
        return;
    else
    {
        char *token = line;
        // Read line while consuming it and copy over to token buf
        while (!is_white_space(line) && *line != 0)
            line++;
        int token_len = line - token;
        // Consume any white space after the token
        while (is_white_space(line))
            line++;

        // A token ending with ':' marks a compact line ("token: ...").
        bool is_compact_line = token_len && token[token_len - 1] == ':';

        TBValue value;
        if (is_compact_line)
        {
            // Cut off the ':' so token holds only the name.
            token_len--;
            token[token_len] = 0;

            // Check if the first argument is not a child but the value for this token
            if (*line == '[' || *line == '\"' || *line == '\'' ||
                    is_start_of_number(line) ||
                    is_start_of_color(line) ||
                    is_start_of_reference(line))
            {
                ConsumeValue(value, line);

                if (pending_multiline)
                {
                    // The value wrapped to the next line, so we should remember the token and continue.
                    multi_line_token.Set(token);
                    return;
                }
            }
        }
        else if (token[token_len])
        {
            // Something follows the token: terminate the token and use the
            // rest of the line as its value.
            token[token_len] = 0;
            UnescapeString(line);
            value.SetFromStringAuto(line, TBValue::SET_AS_STATIC);
        }
        target->OnToken(current_line_nr, token, value);

        // Whatever remains of a compact line is handled as its children.
        if (is_compact_line)
            OnCompactLine(line, target);
    }
}

void TBParser::OnCompactLine(char *line, TBParserTarget *target)
{
    target->Enter();
    while (*line)
    {
        // consume any whitespace
        while (is_white_space(line))
            line++;

        // Find token
        char *token = line;
        while (*line != ':' && *line != 0)
            line++;
        if (!*line)
            break; // Syntax error, expected token
        *line++ = 0;

        // consume any whitespace
        while (is_white_space(line))
            line++;

        TBValue v;
        ConsumeValue(v, line);

        if (pending_multiline)
        {
            // The value wrapped to the next line, so we should remember the token and continue.
            multi_line_token.Set(token);
            // Since we need to call target->Leave when the multiline is ready, set multi_line_sub_level.
            multi_line_sub_level = 1;
            return;
        }

        // Ready
        target->OnToken(current_line_nr, token, v);
    }

    target->Leave();
}

/** Handle a line that continues a multi line value.

    The piece on this line is consumed (ConsumeValue appends it to
    multi_line_value). When no further continuation is pending, the
    accumulated value is reported for the remembered token, the deferred
    Leave() of a compact line is issued if needed, and the multi line state
    is reset. */
void TBParser::OnMultiline(char *line, TBParserTarget *target)
{
    // Skip leading whitespace.
    for (; is_white_space(line); line++)
    {
    }

    TBValue value;
    ConsumeValue(value, line);

    // More pieces to come, nothing to report yet.
    if (pending_multiline)
        return;

    // All lines are in. Report the assembled value.
    value.SetString(multi_line_value.GetData(), TBValue::SET_AS_STATIC);
    target->OnToken(current_line_nr, multi_line_token, value);

    if (multi_line_sub_level)
        target->Leave();

    // Get ready for the next multi line value.
    multi_line_value.SetAppendPos(0);
    multi_line_sub_level = 0;
}

/** Consume one value from line and store it in dst_value.

    A value starting with a single or double quote runs to the matching
    unescaped quote and is stored as a string. Any other value runs to the
    next comma (or the end of the line) and is stored with SetFromStringAuto.
    In both cases the text is terminated and unescaped in place, and line is
    advanced past the value and a following comma.

    Afterwards pending_multiline is updated from what remains of the line.
    If a multi line value is in progress or has just started, the consumed
    text is appended to multi_line_value. */
void TBParser::ConsumeValue(TBValue &dst_value, char *&line)
{
    const bool quoted = (*line == '\"' || *line == '\'');
    if (quoted)
    {
        const char quote_type = *line;
        // Step over the opening quote. The text starts right after it.
        line++;
        char *text = line;

        // Advance to the closing quote, or to the end if there is none.
        while (*line != 0 && !IsEndQuote(text, line, quote_type))
            line++;
        // Replace the closing quote with a terminator.
        if (*line == quote_type)
            *line++ = 0;

        // Skip whitespace and one comma after the closing quote.
        while (is_white_space(line))
            line++;
        if (*line == ',')
            line++;

        UnescapeString(text);
        dst_value.SetString(text, TBValue::SET_AS_STATIC);
    }
    else
    {
        char *text = line;

        // Advance to the next comma, or to the end if there is none.
        while (*line != 0 && *line != ',')
            line++;
        // Replace the comma with a terminator.
        if (*line == ',')
            *line++ = 0;

        UnescapeString(text);
        dst_value.SetFromStringAuto(text, TBValue::SET_AS_STATIC);
    }

    // Remember whether we were already inside a multi line value, then find
    // out whether the value continues on the following line.
    const bool was_pending = pending_multiline;
    pending_multiline = is_pending_multiline(line);

    // Collect this piece if it is part of a multi line value.
    if (was_pending || pending_multiline)
        multi_line_value.AppendString(dst_value.GetString());
}

}; // namespace tb