simperf_host_xml_parser.py - This Python script parses an X…

/indra/lib/python/indra/util/simperf_host_xml_parser.py

https://bitbucket.org/lindenlab/viewer-beta/ · Python · 338 lines · 261 code · 29 blank · 48 comment · 78 complexity · 73165cc82ced5d7fc1eca9fb35aad11c MD5 · raw file

#!/usr/bin/env python
"""\
@file simperf_host_xml_parser.py
@brief Digest collector's XML dump and convert to simple dict/list structure

$LicenseInfo:firstyear=2008&license=mit$

Copyright (c) 2008-2009, Linden Research, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
$/LicenseInfo$
"""

import sys, os, getopt, time
import simplejson
from xml import sax


def usage():
    print "Usage:"
    print sys.argv[0] + " [options]"
    print "  Convert RRD's XML dump to JSON.  Script to convert the simperf_host_collector-"
    print "  generated RRD dump into JSON.  Steps include converting selected named"
    print "  fields from GAUGE type to COUNTER type by computing delta with preceding"
    print "  values.  Top-level named fields are:"
    print 
    print "     lastupdate      Time (javascript timestamp) of last data sample"
    print "     step            Time in seconds between samples"
    print "     ds              Data specification (name/type) for each column"
    print "     database        Table of data samples, one time step per row"
    print 
    print "Options:"
    print "  -i, --in      Input settings filename.  (Default:  stdin)"
    print "  -o, --out     Output settings filename.  (Default:  stdout)"
    print "  -h, --help    Print this message and exit."
    print
    print "Example: %s -i rrddump.xml -o rrddump.json" % sys.argv[0]
    print
    print "Interfaces:"
    print "   class SimPerfHostXMLParser()         # SAX content handler"
    print "   def simperf_host_xml_fixup(parser)   # post-parse value fixup"

class SimPerfHostXMLParser(sax.handler.ContentHandler):

    def __init__(self):
        pass
        
    def startDocument(self):
        self.rrd_last_update = 0         # public
        self.rrd_step = 0                # public
        self.rrd_ds = []                 # public
        self.rrd_records = []            # public
        self._rrd_level = 0
        self._rrd_parse_state = 0
        self._rrd_chars = ""
        self._rrd_capture = False
        self._rrd_ds_val = {}
        self._rrd_data_row = []
        self._rrd_data_row_has_nan = False
        
    def endDocument(self):
        pass

    # Nasty little ad-hoc state machine to extract the elements that are
    # necessary from the 'rrdtool dump' XML output.  The same element
    # name '<ds>' is used for two different data sets so we need to pay
    # some attention to the actual structure to get the ones we want
    # and ignore the ones we don't.
    
    def startElement(self, name, attrs):
        self._rrd_level = self._rrd_level + 1
        self._rrd_capture = False
        if self._rrd_level == 1:
            if name == "rrd" and self._rrd_parse_state == 0:
                self._rrd_parse_state = 1     # In <rrd>
                self._rrd_capture = True
                self._rrd_chars = ""
        elif self._rrd_level == 2:
            if self._rrd_parse_state == 1:
                if name == "lastupdate":
                    self._rrd_parse_state = 2         # In <rrd><lastupdate>
                    self._rrd_capture = True
                    self._rrd_chars = ""
                elif name == "step":
                    self._rrd_parse_state = 3         # In <rrd><step>
                    self._rrd_capture = True
                    self._rrd_chars = ""
                elif name == "ds":
                    self._rrd_parse_state = 4         # In <rrd><ds>
                    self._rrd_ds_val = {}
                    self._rrd_chars = ""
                elif name == "rra":
                    self._rrd_parse_state = 5         # In <rrd><rra>
        elif self._rrd_level == 3:
            if self._rrd_parse_state == 4:
                if name == "name":
                    self._rrd_parse_state = 6         # In <rrd><ds><name>
                    self._rrd_capture = True
                    self._rrd_chars = ""
                elif name == "type":
                    self._rrd_parse_state = 7         # In <rrd><ds><type>
                    self._rrd_capture = True
                    self._rrd_chars = ""
            elif self._rrd_parse_state == 5:
                if name == "database":
                    self._rrd_parse_state = 8         # In <rrd><rra><database>
        elif self._rrd_level == 4:
            if self._rrd_parse_state == 8:
                if name == "row":
                    self._rrd_parse_state = 9         # In <rrd><rra><database><row>
                    self._rrd_data_row = []
                    self._rrd_data_row_has_nan = False
        elif self._rrd_level == 5:
            if self._rrd_parse_state == 9:
                if name == "v":
                    self._rrd_parse_state = 10        # In <rrd><rra><database><row><v>
                    self._rrd_capture = True
                    self._rrd_chars = ""

    def endElement(self, name):
        self._rrd_capture = False
        if self._rrd_parse_state == 10:
            self._rrd_capture = self._rrd_level == 6
            if self._rrd_level == 5:
                if self._rrd_chars == "NaN":
                    self._rrd_data_row_has_nan = True
                else:
                    self._rrd_data_row.append(self._rrd_chars)
                self._rrd_parse_state = 9              # In <rrd><rra><database><row>
        elif self._rrd_parse_state == 9:
            if self._rrd_level == 4:
                if not self._rrd_data_row_has_nan:
                    self.rrd_records.append(self._rrd_data_row)
                self._rrd_parse_state = 8              # In <rrd><rra><database>
        elif self._rrd_parse_state == 8:
            if self._rrd_level == 3:
                self._rrd_parse_state = 5              # In <rrd><rra>
        elif self._rrd_parse_state == 7:
            if self._rrd_level == 3:
                self._rrd_ds_val["type"] = self._rrd_chars
                self._rrd_parse_state = 4              # In <rrd><ds>
        elif self._rrd_parse_state == 6:
            if self._rrd_level == 3:
                self._rrd_ds_val["name"] = self._rrd_chars
                self._rrd_parse_state = 4              # In <rrd><ds>
        elif self._rrd_parse_state == 5:
            if self._rrd_level == 2:
                self._rrd_parse_state = 1              # In <rrd>
        elif self._rrd_parse_state == 4:
            if self._rrd_level == 2:
                self.rrd_ds.append(self._rrd_ds_val)
                self._rrd_parse_state = 1              # In <rrd>
        elif self._rrd_parse_state == 3:
            if self._rrd_level == 2:
                self.rrd_step = long(self._rrd_chars)
                self._rrd_parse_state = 1              # In <rrd>
        elif self._rrd_parse_state == 2:
            if self._rrd_level == 2:
                self.rrd_last_update = long(self._rrd_chars)
                self._rrd_parse_state = 1              # In <rrd>
        elif self._rrd_parse_state == 1:
            if self._rrd_level == 1:
                self._rrd_parse_state = 0              # At top
                
        if self._rrd_level:
            self._rrd_level = self._rrd_level - 1

    def characters(self, content):
        if self._rrd_capture:
            self._rrd_chars = self._rrd_chars + content.strip()

def _make_numeric(value):
    try:
        value = float(value)
    except:
        value = ""
    return value

def simperf_host_xml_fixup(parser, filter_start_time = None, filter_end_time = None):
    # Fixup for GAUGE fields that are really COUNTS.  They
    # were forced to GAUGE to try to disable rrdtool's
    # data interpolation/extrapolation for non-uniform time
    # samples.
    fixup_tags = [ "cpu_user",
                   "cpu_nice",
                   "cpu_sys",
                   "cpu_idle",
                   "cpu_waitio",
                   "cpu_intr",
                   # "file_active",
                   # "file_free",
                   # "inode_active",
                   # "inode_free",
                   "netif_in_kb",
                   "netif_in_pkts",
                   "netif_in_errs",
                   "netif_in_drop",
                   "netif_out_kb",
                   "netif_out_pkts",
                   "netif_out_errs",
                   "netif_out_drop",
                   "vm_page_in",
                   "vm_page_out",
                   "vm_swap_in",
                   "vm_swap_out",
                   #"vm_mem_total",
                   #"vm_mem_used",
                   #"vm_mem_active",
                   #"vm_mem_inactive",
                   #"vm_mem_free",
                   #"vm_mem_buffer",
                   #"vm_swap_cache",
                   #"vm_swap_total",
                   #"vm_swap_used",
                   #"vm_swap_free",
                   "cpu_interrupts",
                   "cpu_switches",
                   "cpu_forks" ]

    col_count = len(parser.rrd_ds)
    row_count = len(parser.rrd_records)

    # Process the last row separately, just to make all values numeric.
    for j in range(col_count):
        parser.rrd_records[row_count - 1][j] = _make_numeric(parser.rrd_records[row_count - 1][j])

    # Process all other row/columns.
    last_different_row = row_count - 1
    current_row = row_count - 2
    while current_row >= 0:
        # Check for a different value than the previous row.  If everything is the same
        # then this is probably just a filler/bogus entry.
        is_different = False
        for j in range(col_count):
            parser.rrd_records[current_row][j] = _make_numeric(parser.rrd_records[current_row][j])
            if parser.rrd_records[current_row][j] != parser.rrd_records[last_different_row][j]:
                # We're good.  This is a different row.
                is_different = True

        if not is_different:
            # This is a filler/bogus entry.  Just ignore it.
            for j in range(col_count):
                parser.rrd_records[current_row][j] = float('nan')
        else:
            # Some tags need to be converted into deltas.
            for j in range(col_count):
                if parser.rrd_ds[j]["name"] in fixup_tags:
                    parser.rrd_records[last_different_row][j] = \
                        parser.rrd_records[last_different_row][j] - parser.rrd_records[current_row][j]
            last_different_row = current_row

        current_row -= 1

    # Set fixup_tags in the first row to 'nan' since they aren't useful anymore.
    for j in range(col_count):
        if parser.rrd_ds[j]["name"] in fixup_tags:
            parser.rrd_records[0][j] = float('nan')

    # Add a timestamp to each row and to the catalog.  Format and name
    # chosen to match other simulator logging (hopefully).
    start_time = parser.rrd_last_update - (parser.rrd_step * (row_count - 1))
    # Build a filtered list of rrd_records if we are limited to a time range.
    filter_records = False
    if filter_start_time is not None or filter_end_time is not None:
        filter_records = True
        filtered_rrd_records = []
        if filter_start_time is None:
            filter_start_time = start_time * 1000
        if filter_end_time is None:
            filter_end_time = parser.rrd_last_update * 1000
        
    for i in range(row_count):
        record_timestamp = (start_time + (i * parser.rrd_step)) * 1000
        parser.rrd_records[i].insert(0, record_timestamp)
        if filter_records:
            if filter_start_time <= record_timestamp and record_timestamp <= filter_end_time:
                filtered_rrd_records.append(parser.rrd_records[i])

    if filter_records:
        parser.rrd_records = filtered_rrd_records

    parser.rrd_ds.insert(0, {"type": "GAUGE", "name": "javascript_timestamp"})


def main(argv=None):
    opts, args = getopt.getopt(sys.argv[1:], "i:o:h", ["in=", "out=", "help"])
    input_file = sys.stdin
    output_file = sys.stdout
    for o, a in opts:
        if o in ("-i", "--in"):
            input_file = open(a, 'r')
        if o in ("-o", "--out"):
            output_file = open(a, 'w')
        if o in ("-h", "--help"):
            usage()
            sys.exit(0)

    # Using the SAX parser as it is at least 4X faster and far, far
    # smaller on this dataset than the DOM-based interface in xml.dom.minidom.
    # With SAX and a 5.4MB xml file, this requires about seven seconds of
    # wall-clock time and 32MB VSZ.  With the DOM interface, about 22 seconds
    # and over 270MB VSZ.

    handler = SimPerfHostXMLParser()
    sax.parse(input_file, handler)
    if input_file != sys.stdin:
        input_file.close()

    # Various format fixups:  string-to-num, gauge-to-counts, add
    # a time stamp, etc.
    simperf_host_xml_fixup(handler)
    
    # Create JSONable dict with interesting data and format/print it
    print >>output_file, simplejson.dumps({ "step" : handler.rrd_step,
                                            "lastupdate": handler.rrd_last_update * 1000,
                                            "ds" : handler.rrd_ds,
                                            "database" : handler.rrd_records })

    return 0

if __name__ == "__main__":
    sys.exit(main())
Summary ✨

This Python script parses an XML file containing performance data, fixes its format, and outputs a JSON object with relevant information to the console. It uses the SAX parser for efficiency and includes various fixups such as converting strings to numbers, handling gauge values, and adding timestamps. The output is a JSON string representing the parsed data in a human-readable format.
Tech Fingerprint

Alerts (23)

'def' Ensure functions have docstrings for documentation
57 301
Complexity hotspot; lines 89 to 90 (total complexity: 3)
89 90
Complexity hotspot; lines 94 to 96 (total complexity: 3)
94 95 96
Complexity hotspot; lines 110 to 112 (total complexity: 3)
110 111 112
Complexity hotspot; lines 123 to 125 (total complexity: 3)
123 124 125
Complexity hotspot; lines 129 to 131 (total complexity: 3)
129 130 131
Complexity hotspot; lines 146 to 148 (total complexity: 3)
146 147 148
Complexity hotspot; lines 291 to 292 (total complexity: 3)
291 292
'open(' Use 'with open()' to ensure Files are properly closed
307 309