PageRenderTime 50ms CodeModel.GetById 18ms app.highlight 28ms RepoModel.GetById 1ms app.codeStats 0ms

/indra/lib/python/indra/util/simperf_host_xml_parser.py

https://bitbucket.org/lindenlab/viewer-beta/
Python | 338 lines | 321 code | 6 blank | 11 comment | 9 complexity | 73165cc82ced5d7fc1eca9fb35aad11c MD5 | raw file
  1#!/usr/bin/env python
  2"""\
  3@file simperf_host_xml_parser.py
  4@brief Digest collector's XML dump and convert to simple dict/list structure
  5
  6$LicenseInfo:firstyear=2008&license=mit$
  7
  8Copyright (c) 2008-2009, Linden Research, Inc.
  9
 10Permission is hereby granted, free of charge, to any person obtaining a copy
 11of this software and associated documentation files (the "Software"), to deal
 12in the Software without restriction, including without limitation the rights
 13to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14copies of the Software, and to permit persons to whom the Software is
 15furnished to do so, subject to the following conditions:
 16
 17The above copyright notice and this permission notice shall be included in
 18all copies or substantial portions of the Software.
 19
 20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 26THE SOFTWARE.
 27$/LicenseInfo$
 28"""
 29
 30import sys, os, getopt, time
 31import simplejson
 32from xml import sax
 33
 34
 35def usage():
 36    print "Usage:"
 37    print sys.argv[0] + " [options]"
 38    print "  Convert RRD's XML dump to JSON.  Script to convert the simperf_host_collector-"
 39    print "  generated RRD dump into JSON.  Steps include converting selected named"
 40    print "  fields from GAUGE type to COUNTER type by computing delta with preceding"
 41    print "  values.  Top-level named fields are:"
 42    print 
 43    print "     lastupdate      Time (javascript timestamp) of last data sample"
 44    print "     step            Time in seconds between samples"
 45    print "     ds              Data specification (name/type) for each column"
 46    print "     database        Table of data samples, one time step per row"
 47    print 
 48    print "Options:"
 49    print "  -i, --in      Input settings filename.  (Default:  stdin)"
 50    print "  -o, --out     Output settings filename.  (Default:  stdout)"
 51    print "  -h, --help    Print this message and exit."
 52    print
 53    print "Example: %s -i rrddump.xml -o rrddump.json" % sys.argv[0]
 54    print
 55    print "Interfaces:"
 56    print "   class SimPerfHostXMLParser()         # SAX content handler"
 57    print "   def simperf_host_xml_fixup(parser)   # post-parse value fixup"
 58
 59class SimPerfHostXMLParser(sax.handler.ContentHandler):
 60
 61    def __init__(self):
 62        pass
 63        
 64    def startDocument(self):
 65        self.rrd_last_update = 0         # public
 66        self.rrd_step = 0                # public
 67        self.rrd_ds = []                 # public
 68        self.rrd_records = []            # public
 69        self._rrd_level = 0
 70        self._rrd_parse_state = 0
 71        self._rrd_chars = ""
 72        self._rrd_capture = False
 73        self._rrd_ds_val = {}
 74        self._rrd_data_row = []
 75        self._rrd_data_row_has_nan = False
 76        
 77    def endDocument(self):
 78        pass
 79
 80    # Nasty little ad-hoc state machine to extract the elements that are
 81    # necessary from the 'rrdtool dump' XML output.  The same element
 82    # name '<ds>' is used for two different data sets so we need to pay
 83    # some attention to the actual structure to get the ones we want
 84    # and ignore the ones we don't.
 85    
 86    def startElement(self, name, attrs):
 87        self._rrd_level = self._rrd_level + 1
 88        self._rrd_capture = False
 89        if self._rrd_level == 1:
 90            if name == "rrd" and self._rrd_parse_state == 0:
 91                self._rrd_parse_state = 1     # In <rrd>
 92                self._rrd_capture = True
 93                self._rrd_chars = ""
 94        elif self._rrd_level == 2:
 95            if self._rrd_parse_state == 1:
 96                if name == "lastupdate":
 97                    self._rrd_parse_state = 2         # In <rrd><lastupdate>
 98                    self._rrd_capture = True
 99                    self._rrd_chars = ""
100                elif name == "step":
101                    self._rrd_parse_state = 3         # In <rrd><step>
102                    self._rrd_capture = True
103                    self._rrd_chars = ""
104                elif name == "ds":
105                    self._rrd_parse_state = 4         # In <rrd><ds>
106                    self._rrd_ds_val = {}
107                    self._rrd_chars = ""
108                elif name == "rra":
109                    self._rrd_parse_state = 5         # In <rrd><rra>
110        elif self._rrd_level == 3:
111            if self._rrd_parse_state == 4:
112                if name == "name":
113                    self._rrd_parse_state = 6         # In <rrd><ds><name>
114                    self._rrd_capture = True
115                    self._rrd_chars = ""
116                elif name == "type":
117                    self._rrd_parse_state = 7         # In <rrd><ds><type>
118                    self._rrd_capture = True
119                    self._rrd_chars = ""
120            elif self._rrd_parse_state == 5:
121                if name == "database":
122                    self._rrd_parse_state = 8         # In <rrd><rra><database>
123        elif self._rrd_level == 4:
124            if self._rrd_parse_state == 8:
125                if name == "row":
126                    self._rrd_parse_state = 9         # In <rrd><rra><database><row>
127                    self._rrd_data_row = []
128                    self._rrd_data_row_has_nan = False
129        elif self._rrd_level == 5:
130            if self._rrd_parse_state == 9:
131                if name == "v":
132                    self._rrd_parse_state = 10        # In <rrd><rra><database><row><v>
133                    self._rrd_capture = True
134                    self._rrd_chars = ""
135
136    def endElement(self, name):
137        self._rrd_capture = False
138        if self._rrd_parse_state == 10:
139            self._rrd_capture = self._rrd_level == 6
140            if self._rrd_level == 5:
141                if self._rrd_chars == "NaN":
142                    self._rrd_data_row_has_nan = True
143                else:
144                    self._rrd_data_row.append(self._rrd_chars)
145                self._rrd_parse_state = 9              # In <rrd><rra><database><row>
146        elif self._rrd_parse_state == 9:
147            if self._rrd_level == 4:
148                if not self._rrd_data_row_has_nan:
149                    self.rrd_records.append(self._rrd_data_row)
150                self._rrd_parse_state = 8              # In <rrd><rra><database>
151        elif self._rrd_parse_state == 8:
152            if self._rrd_level == 3:
153                self._rrd_parse_state = 5              # In <rrd><rra>
154        elif self._rrd_parse_state == 7:
155            if self._rrd_level == 3:
156                self._rrd_ds_val["type"] = self._rrd_chars
157                self._rrd_parse_state = 4              # In <rrd><ds>
158        elif self._rrd_parse_state == 6:
159            if self._rrd_level == 3:
160                self._rrd_ds_val["name"] = self._rrd_chars
161                self._rrd_parse_state = 4              # In <rrd><ds>
162        elif self._rrd_parse_state == 5:
163            if self._rrd_level == 2:
164                self._rrd_parse_state = 1              # In <rrd>
165        elif self._rrd_parse_state == 4:
166            if self._rrd_level == 2:
167                self.rrd_ds.append(self._rrd_ds_val)
168                self._rrd_parse_state = 1              # In <rrd>
169        elif self._rrd_parse_state == 3:
170            if self._rrd_level == 2:
171                self.rrd_step = long(self._rrd_chars)
172                self._rrd_parse_state = 1              # In <rrd>
173        elif self._rrd_parse_state == 2:
174            if self._rrd_level == 2:
175                self.rrd_last_update = long(self._rrd_chars)
176                self._rrd_parse_state = 1              # In <rrd>
177        elif self._rrd_parse_state == 1:
178            if self._rrd_level == 1:
179                self._rrd_parse_state = 0              # At top
180                
181        if self._rrd_level:
182            self._rrd_level = self._rrd_level - 1
183
184    def characters(self, content):
185        if self._rrd_capture:
186            self._rrd_chars = self._rrd_chars + content.strip()
187
def _make_numeric(value):
    """Return VALUE converted to float, or "" if it cannot be converted."""
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are mapped to ""; a bare 'except' would
        # also have swallowed KeyboardInterrupt and SystemExit.
        return ""


# Fields recorded as GAUGE that are really COUNTS.  They were forced to
# GAUGE to try to disable rrdtool's data interpolation/extrapolation for
# non-uniform time samples.  Deliberately NOT listed (left as gauges):
# file_active/free, inode_active/free, vm_mem_*, vm_swap_cache/total/used/free.
_SIMPERF_COUNTER_TAGS = frozenset([
    "cpu_user",
    "cpu_nice",
    "cpu_sys",
    "cpu_idle",
    "cpu_waitio",
    "cpu_intr",
    "netif_in_kb",
    "netif_in_pkts",
    "netif_in_errs",
    "netif_in_drop",
    "netif_out_kb",
    "netif_out_pkts",
    "netif_out_errs",
    "netif_out_drop",
    "vm_page_in",
    "vm_page_out",
    "vm_swap_in",
    "vm_swap_out",
    "cpu_interrupts",
    "cpu_switches",
    "cpu_forks",
])


def simperf_host_xml_fixup(parser, filter_start_time = None, filter_end_time = None):
    """Post-process a parsed RRD dump in place.

    PARSER is an object exposing rrd_ds, rrd_records, rrd_step and
    rrd_last_update as filled in by SimPerfHostXMLParser.  The function:

      * converts every sample to float ("" when it is not numeric);
      * overwrites rows identical to the following real row with NaN,
        treating them as filler entries;
      * turns the counter-like columns into deltas against the preceding
        real row, and sets them to NaN in the earliest real row, which has
        no predecessor to subtract;
      * prepends a millisecond timestamp to every row, computed backwards
        from rrd_last_update in rrd_step increments, and prepends a
        matching "javascript_timestamp" entry to rrd_ds;
      * if FILTER_START_TIME and/or FILTER_END_TIME (milliseconds,
        inclusive) is given, replaces parser.rrd_records with only the
        rows inside that range.

    An empty rrd_records is accepted; only the rrd_ds entry is added.
    Returns None.
    """
    records = parser.rrd_records
    col_count = len(parser.rrd_ds)
    row_count = len(records)
    counter_cols = [j for j in range(col_count)
                    if parser.rrd_ds[j].get("name") in _SIMPERF_COUNTER_TAGS]

    if row_count:
        # Process the last row separately, just to make all values numeric.
        newest = records[row_count - 1]
        for j in range(col_count):
            newest[j] = _make_numeric(newest[j])

        # Walk the remaining rows from newest to oldest.
        last_different_row = row_count - 1
        for current_row in range(row_count - 2, -1, -1):
            row = records[current_row]
            reference = records[last_different_row]

            # Check for a different value than the following real row.  If
            # everything is the same this is probably a filler/bogus entry.
            # No early exit: every column must still be made numeric.
            is_different = False
            for j in range(col_count):
                row[j] = _make_numeric(row[j])
                if row[j] != reference[j]:
                    is_different = True

            if is_different:
                # The following real row becomes a delta against this one.
                for j in counter_cols:
                    reference[j] = reference[j] - row[j]
                last_different_row = current_row
            else:
                # This is a filler/bogus entry.  Just blank it.
                for j in range(col_count):
                    row[j] = float('nan')

        # The earliest real row still holds raw counter totals, which are
        # meaningless as deltas.  That row is row 0 unless the data starts
        # with filler rows (those are already all-NaN), so blank the
        # counters in last_different_row rather than blindly in row 0.
        earliest = records[last_different_row]
        for j in counter_cols:
            earliest[j] = float('nan')

    # Add a timestamp to each row and to the catalog.  Format and name
    # chosen to match other simulator logging (hopefully).
    start_time = parser.rrd_last_update - (parser.rrd_step * (row_count - 1))

    # Build a filtered list of rrd_records if we are limited to a time range.
    filter_records = filter_start_time is not None or filter_end_time is not None
    if filter_records:
        if filter_start_time is None:
            filter_start_time = start_time * 1000
        if filter_end_time is None:
            filter_end_time = parser.rrd_last_update * 1000

    filtered_rrd_records = []
    for i in range(row_count):
        record_timestamp = (start_time + (i * parser.rrd_step)) * 1000
        records[i].insert(0, record_timestamp)
        if filter_records and filter_start_time <= record_timestamp <= filter_end_time:
            filtered_rrd_records.append(records[i])

    if filter_records:
        parser.rrd_records = filtered_rrd_records

    parser.rrd_ds.insert(0, {"type": "GAUGE", "name": "javascript_timestamp"})
299
300
def main(argv=None):
    """Command-line entry point: convert an RRD XML dump to JSON.

    ARGV is the full argument vector including the program name at index
    0; when None (the default) sys.argv is used.  Recognised options are
    -i/--in FILE (default stdin), -o/--out FILE (default stdout) and
    -h/--help.

    Returns 0 on success, or 2 after reporting an invalid option.
    Raises SystemExit(0) after printing the help text for -h/--help.
    """
    if argv is None:
        argv = sys.argv

    try:
        opts, args = getopt.getopt(argv[1:], "i:o:h", ["in=", "out=", "help"])
    except getopt.GetoptError:
        # sys.exc_info() instead of 'except ... as err' keeps this valid
        # on every Python 2 and 3 release.
        sys.stderr.write("%s\n" % sys.exc_info()[1])
        usage()
        return 2

    # Collect the paths first and open them later, so that '-h' never
    # opens (or truncates) anything.
    input_path = None
    output_path = None
    for o, a in opts:
        if o in ("-i", "--in"):
            input_path = a
        if o in ("-o", "--out"):
            output_path = a
        if o in ("-h", "--help"):
            usage()
            sys.exit(0)

    # Using the SAX parser as it is at least 4X faster and far, far
    # smaller on this dataset than the DOM-based interface in xml.dom.minidom.
    # With SAX and a 5.4MB xml file, this requires about seven seconds of
    # wall-clock time and 32MB VSZ.  With the DOM interface, about 22 seconds
    # and over 270MB VSZ.
    handler = SimPerfHostXMLParser()
    if input_path is None:
        sax.parse(sys.stdin, handler)
    else:
        input_file = open(input_path, 'r')
        try:
            sax.parse(input_file, handler)
        finally:
            input_file.close()

    # Various format fixups:  string-to-num, gauge-to-counts, add
    # a time stamp, etc.
    simperf_host_xml_fixup(handler)

    # Create JSONable dict with interesting data and format it.
    document = simplejson.dumps({ "step" : handler.rrd_step,
                                  "lastupdate": handler.rrd_last_update * 1000,
                                  "ds" : handler.rrd_ds,
                                  "database" : handler.rrd_records })

    # The output file is only opened once there is something to write, so
    # a failed parse does not clobber an existing file.
    if output_path is None:
        sys.stdout.write(document + "\n")
    else:
        output_file = open(output_path, 'w')
        try:
            output_file.write(document + "\n")
        finally:
            output_file.close()

    return 0
336
# Run the converter only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())