HiBench Learning Notes (9): Analyzing the Source of monitor.py
monitor.py is the main monitoring program: it samples system metrics on every monitored node over SSH, writes the collected data to a log file, and finally aggregates that log into an HTML report page.
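Before walking through the full listing below, here is a condensed sketch of the control flow, pieced together from the __main__ block and show_usage() near the end of the file (the angle-bracket arguments are placeholders):
- # Invocation (see show_usage() below):
- #   monitor.py <workload_title> <parent_pid> <log_path.log> <bench_log.log> <report_path.html> <node1> ... <nodeN>
- # The script forks; the parent prints the child's pid and exits, while the child roughly does:
- #   start_monitor(log_path, nodes)     # one SSH probe thread per node, samples appended to log_path
- #   while /proc/<parent_pid> exists: sleep(1)
- #   na.stop()                          # stop the probe threads (also triggered by SIGTERM)
- #   generate_report(workload_title, log_path, bench_log_path, report_path)   # render chart-template.html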
- #!/usr/bin/env python2
- # Licensed to the Apache Software Foundation (ASF) under one or more
- # contributor license agreements. See the NOTICE file distributed with
- # this work for additional information regarding copyright ownership.
- # The ASF licenses this file to You under the Apache License, Version 2.0
- # (the "License"); you may not use this file except in compliance with
- # the License. You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import threading, subprocess, re, os, sys, signal, socket
- from time import sleep, time, mktime
- from contextlib import closing
- import traceback, thread
- from datetime import datetime
- from collections import namedtuple
- from pprint import pprint
- from itertools import groupby
- # Probe intervals, in seconds.
- # Warning: a value that is too short may give wrong results when system load is high, due to lack of data.
- # The value must be a float!
- PROBE_INTERVAL=float(5)
- #FIXME: use log helper later
- #log_lock = threading.Lock()
- def log(*s):
- if len(s)==1: s=s[0]
- else: s= " ".join([str(x) for x in s])
- # with log_lock:
- # with open("/home/zhihui/monitor_proc.log", 'a') as f:
- log_str = str(thread.get_ident())+":"+str(s) +'\n'
- # f.write( log_str )
- sys.stderr.write(log_str)
- entered=False
- def sig_term_handler(signo, stack):
- global entered
- global log_path
- global report_path
- global workload_title
- global bench_log_path
- global na
- if not entered:
- entered=True # FIXME: Not atomic
- else: return
- na.stop()
- generate_report(workload_title, log_path, bench_log_path, report_path)
- sys.exit(0)
- def samedir(fn):
- """
- return abspath of fn in the same directory as this python file
- """
- return os.path.abspath(os.path.join(os.path.dirname(__file__), fn))
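- # PatchedNameTuple: mixin that gives the metric namedtuples below (CPU, Network, Disk, ...) element-wise
- # subtraction, division and _add(), which the monitors use to turn cumulative /proc counters into per-interval deltas.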
- class PatchedNameTuple(object):
- def __sub__(self, other):
- assert isinstance(other, self.__class__)
- assert self[0] == other[0]
- cls = self.__class__
- return cls(self[0], *[a-b for a, b in zip(self[1:], other[1:])])
- def __div__(self, other):
- return self.__class__(self[0], *[a/other for a in self[1:]])
- def _add(self, other, override_title=None):
- if other == None: return self
- assert isinstance(other, self.__class__)
- cls = self.__class__
- title = self[0] if not override_title else override_title
- return cls(title, *[a+b for a, b in zip(self[1:], other[1:])])
- def ident(size, s):
- return "\n".join((" "*size + x for x in s.split("\n")))
- class RemoteProc(threading.Thread):
- SEP="----SEP----"
- template_debug=r"""exec('
- import time, os, sys, socket, traceback
- socket.setdefaulttimeout(1)
- def log(*x, **kw):
- with open("/home/zhihui/probe.log", kw.get("mode","a")) as f:
- f.write(repr(x)+chr(10))
- try:
- log("create socket", mode="w")
- s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- log("bind socket")
- s.bind(("0.0.0.0",0))
- log("listen socket")
- s.listen(5)
- log("bind socket to:", s.getsockname())
- while True:
- log("accepting")
- try:
- print s.getsockname()[1]
- s2,peer=s.accept()
- break
- except socket.timeout:
- log("accept timeout, retry")
- log("accepted, peer:",peer)
- except Exception as e:
- import traceback
- log(traceback.format_exc())
- {func_template}
- while True:
- s2.send(("{SEP}+%s" % time.time())+chr(10))
- {call_template}
- s2.send("{SEP}#end"+chr(10))
- time.sleep({interval})
- ')"""
- template=r"""exec('
- import time, os, sys, socket, traceback
- s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- s.bind(("0.0.0.0",0))
- s.listen(5)
- print s.getsockname()[1]
- s2,peer=s.accept()
- {func_template}
- while True:
- s2.send(("{SEP}+%s" % time.time())+chr(10))
- {call_template}
- s2.send("{SEP}#end"+chr(10))
- time.sleep({interval})
- ')"""
- def __init__(self, host, interval=1):
- self.host = host
- self.cmds = []
- self.interval = interval
- self.monitor_ins = {}
- self.local_aggr_container={}
- self._running=True
- super(RemoteProc, self).__init__()
- def register(self, monitor_ins, cmds):
- assert isinstance(monitor_ins, BaseMonitor)
- self.monitor_ins[len(self.cmds)] = monitor_ins # monitor command seq id => monitor instance
- self.cmds.append(cmds)
- def run(self):
- func_template = "\n".join(["def func_{id}():\n{func}"\
- .format(id=id,
- func=ident(2,
- func+'\ns2.send("{SEP}={id}"+chr(10))'\
- .format(SEP=self.SEP, id=id))) \
- for id, func in enumerate(self.cmds)])
- call_template="\n".join([" func_{id}()"\
- .format(id=id) for id in range(len(self.cmds))]
- )
- script = self.template.format(func_template=func_template,
- call_template=call_template,
- interval = self.interval,
- SEP = self.SEP)
- s = script.replace('"', r'\"').replace("\n", r"\n")
- container=[]
- # log("ssh client to:", self.host)
- with self.ssh_client(self.host, "python -u -c \"{script}\"".format(script=s)) as f:
- # log("ssh client %s connected" % self.host)
- try:
- port_line = f.readline()
- # log("host:", self.host, "got port,", port_line)
- port = int(port_line.rstrip())
- s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
- s.settimeout(0.5)
- for i in range(30): # try to connect 30 times maximum
- try:
- # log("try to connect:", self.host, port)
- s.connect((self.host, port))
- # log("connectted to:", self.host, port)
- break
- except socket.timeout:
- # log("connecting to:", self.host, port, "timedout")
- pass
- else: # not connected after 30 attempts
- # log("can't connect to:", self.host, port)
- s.shutdown(socket.SHUT_RDWR)
- self.ssh_close()
- return
- s.settimeout(None)
- except Exception as e:
- log(traceback.format_exc())
- with closing(s.makefile()) as f2:
- while self._running:
- try:
- l = f2.readline()
- except KeyboardInterrupt:
- break
- if not l: break
- if l.startswith(self.SEP):
- tail = l.lstrip(self.SEP)
- if tail[0]=='+': # timestamp
- remote_timestamp = float(tail[1:])
- cur_timestamp = time()
- elif tail.startswith('#end'): # end sign
- # log("na push, timestamp:", cur_timestamp)
- self.na_push(cur_timestamp)
- else:
- id = int(tail[1:])
- if self.monitor_ins[id]:
- self.monitor_ins[id].feed(container, cur_timestamp)
- container = []
- else:
- container.append(l.rstrip())
- s.shutdown(socket.SHUT_RDWR)
- self.ssh_close()
- def stop(self):
- self._running=False
- def aggregate(self, timestamp, data):
- if not self.local_aggr_container:
- self.local_aggr_container['timestamp']=timestamp
- assert timestamp == self.local_aggr_container['timestamp']
- assert type(data) is dict
- self.local_aggr_container.update(data)
- self.local_aggr_container['timestamp'] = timestamp
- def na_register(self, na):
- assert isinstance(na, NodeAggregator)
- self.node_aggr_parent = na
- def na_push(self, timestamp):
- if self.local_aggr_container:
- assert self.local_aggr_container.get('timestamp', -1) == timestamp
- self.node_aggr_parent.commit_aggregate(self.host, self.local_aggr_container)
- self.local_aggr_container={}
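- # BaseMonitor: base class of the per-metric monitors. feed() parses one probe sample; commit() diffs it
- # against the previous sample (skipping IGNORE_KEYS), adds a '<header>/total' sum and hands the result to
- # RemoteProc.aggregate().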
- class BaseMonitor(object):
- IGNORE_KEYS=[]
- def __init__(self, rproc):
- self.rproc = rproc
- self._last = None
- def feed(self, container, timestamp): # override to parse pulled data files
- raise NotImplementedError()
- def ssh_client(self, host, shell): # override for opening ssh client
- raise NotImplementedError()
- def ssh_close(self): # override for clear up ssh client
- raise NotImplementedError()
- def commit(self, timestamp, header, stat):
- if self._last is None: self._last = stat
- else:
- stat_delta = dict([(header+'/'+k, stat[k] - self._last[k]) \
- for k in set(self._last.keys()).union(set(stat.keys()))\
- if k in stat and k in self._last and k not in self.IGNORE_KEYS
- ])
- self._last = stat
- # if header.startswith("net"):
- # print stat_delta
- stat_delta[header+'/total'] = reduce_patched(lambda a,b: a._add(b, 'total'), stat_delta.values())
- self.rproc.aggregate(timestamp, stat_delta)
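- # BashSSHClientMixin: implements ssh_client()/ssh_close() by spawning the system ssh binary through
- # subprocess.Popen and reading its stdout.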
- class BashSSHClientMixin(object):
- ssh_lock = threading.Lock()
- def ssh_client(self, host, shell):
- with open(os.devnull, 'rb', 0) as DEVNULL:
- with BashSSHClientMixin.ssh_lock:
- self.proc = subprocess.Popen(["ssh", host, shell], bufsize=1,
- stdin=DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
- return self.proc.stdout
- def ssh_close(self):
- assert self.proc
- self.proc.terminate()
- self.proc.wait()
- return self.proc.returncode
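- # CPU / CPUMonitor: the probe sends the 'cpu*' lines of /proc/stat; commit() diffs consecutive snapshots and
- # converts them to percentages, keyed as 'cpu/total', 'cpu/cpu0', ...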
- _CPU=namedtuple("CPU", ['label', 'user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq'])
- class CPU(_CPU, PatchedNameTuple):
- def percentage(self):
- total = sum(self[1:])
- return CPU(self[0], *[x*100.0 / total for x in self[1:]]) if total>0 else self
- class CPUMonitor(BaseMonitor):
- def __init__(self, rproc):
- super(CPUMonitor, self).__init__(rproc)
- rproc.register(self, """with open("/proc/stat") as f:
- s2.send("".join([x for x in f.readlines() if x.startswith("cpu")]))
- """)
- def feed(self, container, timestamp):
- "parse /proc/stat"
- self.commit(timestamp, dict([self._parse_stat(line) for line in container]))
- def _parse_stat(self, line):
- "parse one line of /proc/stat"
- assert line.strip(), "BUG! empty line in /proc/stat"
- fields = line.split()
- if fields[0]=='cpu':
- fields[0]='total'
- return (fields[0], CPU(fields[0], *[int(x) for x in fields[1:8]]))
- def commit(self, timestamp, cpu_stat):
- if self._last is None:
- self._last = cpu_stat
- else:
- cpu_usage = dict([("cpu/"+k, (cpu_stat[k] - self._last[k]).percentage()) for k in self._last])
- self._last = cpu_stat
- self.rproc.aggregate(timestamp, cpu_usage)
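- # Network / NetworkMonitor: the probe sends /proc/net/dev; the loopback interface is ignored and adapters
- # with zero traffic in both directions are dropped. Keyed as 'net/<adapter>' plus the summed 'net/total'.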
- _Network=namedtuple("Network", ['label', "recv_bytes", "recv_packets", "recv_errs", "recv_drop",
- "send_bytes", "send_packets", "send_errs", "send_drop"])
- class Network(_Network, PatchedNameTuple): pass
- class NetworkMonitor(BaseMonitor):
- IGNORE_KEYS=["lo"]
- def __init__(self, rproc):
- rproc.register(self, """with open("/proc/net/dev") as f:
- s2.send("".join([x for x in f.readlines()]))
- """)
- self._filter = re.compile('^\s*(.+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*$')
- super(NetworkMonitor, self).__init__(rproc)
- def feed(self, container, timestamp):
- "parse /proc/net/dev"
- self.commit(timestamp, "net", dict(filter(lambda x:x, [self._parse_net_dev(line) for line in container])))
- def _parse_net_dev(self, line):
- matched = self._filter.match(line)
- if matched:
- obj = Network(matched.groups()[0], *[int(x) for x in matched.groups()[1:]])
- if not (obj.recv_bytes==0 and obj.send_bytes==0):
- return (obj[0], obj)
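- # Disk / DiskMonitor: the probe sends /proc/diskstats entries that appear in /sys/block, skipping loop
- # devices and devices with no completed reads; sector counts are converted to bytes (x512) and times to seconds.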
- _Disk=namedtuple("Disk", ["label", "io_read", "bytes_read", "time_spent_read", "io_write", "bytes_write", "time_spent_write"])
- class Disk(_Disk, PatchedNameTuple): pass
- class DiskMonitor(BaseMonitor):
- def __init__(self, rproc):
- super(DiskMonitor, self).__init__(rproc)
- rproc.register(self, """with open("/proc/diskstats") as f:
- blocks = os.listdir("/sys/block")
- s2.send("".join([x for x in f.readlines() if x.split()[2] in blocks and not x.split()[2].startswith("loop") and x.split()[3]!="0"]))
- """)
- def feed(self, container, timestamp):
- "parse /proc/diskstats"
- self.commit(timestamp, "disk", dict([self._parse_disk_stat(line) for line in container]))
- def _parse_disk_stat(self, line):
- fields = line.split()[2:]
- obj = Disk(fields[0],
- io_read=int(fields[1]), bytes_read=int(fields[3])*512, time_spent_read=int(fields[4])/1000.0,
- io_write=int(fields[5]), bytes_write=int(fields[7])*512, time_spent_write=int(fields[8])/1000.0)
- return (obj[0], obj)
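- # Memory / MemoryMonitor: the probe sends MemTotal:Buffers:Cached:MemFree:Mapped from /proc/meminfo (in KB);
- # unlike the other monitors it reports absolute values (no delta) under 'memory/total'.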
- _Memory=namedtuple("Memory", ["label", "total", "used", "buffer_cache", "free", "map"])
- class Memory(_Memory, PatchedNameTuple): pass
- class MemoryMonitor(BaseMonitor):
- def __init__(self, rproc):
- super(MemoryMonitor, self).__init__(rproc)
- rproc.register(self, """with open("/proc/meminfo") as f:
- mem = dict([(a, b.split()[0].strip()) for a, b in [x.split(":") for x in f.readlines()]])
- s2.send(":".join([mem[field] for field in ["MemTotal", "Buffers", "Cached", "MemFree", "Mapped"]])+chr(10))
- """)
- def feed(self, memory_status, timestamp):
- "parse /proc/meminfo"
- total, buffers, cached, free, mapped= [int(x) for x in memory_status[0].split(":")]
- self.rproc.aggregate(timestamp, {"memory/total":Memory(label="total", total=total,
- used=total - free - buffers-cached,
- buffer_cache=buffers + cached,
- free=free, map=mapped)})
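- # Proc / ProcMonitor: the probe sends /proc/loadavg; despite the field names load5/load10/load15, the first
- # three values are the 1-, 5- and 15-minute load averages, followed by the running/total process counts.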
- _Proc=namedtuple("Proc", ["label", "load5", "load10", "load15", "running", "procs"])
- class Proc(_Proc, PatchedNameTuple): pass
- class ProcMonitor(BaseMonitor):
- def __init__(self, rproc):
- super(ProcMonitor, self).__init__(rproc)
- rproc.register(self, """with open("/proc/loadavg") as f:
- s2.send(f.read())
- """)
- def feed(self, load_status, timestamp):
- "parse /proc/meminfo"
- load5, load10, load15, running_procs= load_status[0].split()[:4]
- running, procs = running_procs.split('/')
- self.rproc.aggregate(timestamp, {"proc":Proc(label="total", load5=float(load5), load10=float(load10),
- load15=float(load15), running=int(running), procs=int(procs))})
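- # NodeAggregator: owns one RemoteProc per node and serializes every committed sample as a repr()'d dict
- # (one per line, tagged with 'hostname' and 'timestamp') into the monitor log; generate_report() later
- # reads these lines back with eval().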
- class NodeAggregator(object):
- def __init__(self, log_name):
- self.node_pool = {}
- self.log_name = log_name
- self.log_lock = threading.Lock()
- try:
- os.unlink(self.log_name)
- except OSError:
- pass
- def append(self, node):
- assert isinstance(node, RemoteProc)
- self.node_pool[node.host] = node
- node.na_register(self)
- def commit_aggregate(self, node, datas):
- datas['hostname'] = node
- with self.log_lock:
- with file(self.log_name, "a") as f:
- f.write(repr(datas) + "\n")
- def run(self):
- for v in self.node_pool.values():
- v.start()
- def stop(self):
- for v in self.node_pool.values():
- v.stop()
- for v in self.node_pool.values():
- v.join()
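- # Helpers used by generate_report(): round_to_base() snaps timestamps to the probe interval,
- # filter_dict_with_prefix(es)() selects metric keys by prefix (a leading '!' negates the match), and
- # reduce_patched() is reduce() that also tolerates lists of length 0 or 1.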
- def round_to_base(v, b):
- """
- >>> round_to_base(0.1, 0.3)
- 0.0
- >>> round_to_base(0.3, 0.3)
- 0.3
- >>> round_to_base(0.0, 0.3)
- 0.0
- >>> round_to_base(0.5, 0.3)
- 0.3
- >>> round_to_base(0.51, 0.3)
- 0.3
- """
- for i in range(10):
- base = int(b * 10**i)
- if abs(base - b * 10**i) < 0.001: break
- assert base>0
- return float(int(v * 10**i) / base * base) / (10**i)
- def filter_dict_with_prefix(d, prefix, sort=True):
- keys = sorted(d.keys()) if sort else d.keys()
- if prefix[0]=='!':
- return dict([(x, d[x]) for x in keys if not x.startswith(prefix[1:])])
- else:
- return dict([(x, d[x]) for x in keys if x.startswith(prefix)])
- def reduce_patched(func, data):
- if len(data)==1:
- return data[0]
- elif len(data)==0:
- return data
- else:
- return reduce(func, data)
- def filter_dict_with_prefixes(d, *prefixes):
- if len(prefixes)==1:
- return filter_dict_with_prefix(d, prefixes[0])
- else:
- return reduce_patched(lambda a,b: filter_dict_with_prefix(filter_dict_with_prefix(d, a),b),
- prefixes)
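- # test() / test2(): small manual smoke tests that run the SSH probe and the monitors against localhost;
- # they are not used by start_monitor().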
- def test():
- p = BashSSHClientMixin()
- script=r"""exec('
- import time, os, sys
- while 1:
- with open("/proc/stat") as f: print f.read(),
- print "---hello---"
- time.sleep(1)
- ')"""
- s = script.replace('"', r'\"').replace("\n", r"\n")
- with p.ssh_client("localhost", "python -u -c \"{s}\"".format(s=s)) as f:
- while 1:
- l = f.readline()
- print l.rstrip()
- if not l: break
- p.ssh_close()
- def test2():
- class P(RemoteProc, BashSSHClientMixin): pass
- p = P("localhost", 0.3)
- CPUMonitor(p)
- NetworkMonitor(p)
- DiskMonitor(p)
- MemoryMonitor(p)
- p.run()
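- # start_monitor(): builds one probe (RemoteProc + BashSSHClientMixin) per unique node with CPU, network,
- # disk, memory and load monitors attached, registers them with a NodeAggregator writing to log_filename,
- # and starts them.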
- def start_monitor(log_filename, nodes):
- class P(RemoteProc, BashSSHClientMixin):
- def __init__(self, *args):
- RemoteProc.__init__(self, *args)
- CPUMonitor(self)
- NetworkMonitor(self)
- DiskMonitor(self)
- MemoryMonitor(self)
- ProcMonitor(self)
- global na
- na = NodeAggregator(log_filename)
- nodes = sorted(list(set(nodes)))
- for node in nodes:
- na.append(P(node, PROBE_INTERVAL))
- na.run()
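- # parse_bench_log(): scans the benchmark's own log for Spark stage submit/finish and Hadoop job
- # start/progress/completion lines and turns them into 'timestamp,description' event annotations for the
- # report; overlong descriptions are truncated and near-simultaneous events are merged.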
- def parse_bench_log(benchlog_fn):
- events=["x,event"]
- _spark_stage_submit = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: Submitting (Stage \d+) \((.*)\).+$") # submit spark stage
- _spark_stage_finish = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: (Stage \d+) \((.*)\) finished.+$") # spark stage finish
- _hadoop_run_job = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*: Running job: job_([\d_]+)$") # hadoop run job
- _hadoop_map_reduce_progress = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*:\s+map (\d{1,2})% reduce (\d{1,2})%$") # hadoop reduce progress
- _hadoop_job_complete_mr1 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.JobClient: Job complete: job_([\d_]+)$")
- _hadoop_job_complete_mr2 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapreduce.Job: Job job_([\d_]+) completed successfully$")
- """
- # MR1 sample
- 14/06/24 11:18:39 INFO mapred.JobClient: Running job: job_201406241116_0001
- 14/06/24 11:18:40 INFO mapred.JobClient: map 0% reduce 0%
- ...
- 13/11/21 14:38:55 INFO mapred.JobClient: Job complete: job_201311150128_0050
- # MR2 sample
- 15/04/10 17:20:01 INFO mapreduce.Job: Running job: job_1427781540447_0448
- 15/04/10 17:20:07 INFO mapreduce.Job: Job job_1427781540447_0448 running in uber mode : false
- 15/04/10 17:20:07 INFO mapreduce.Job: map 0% reduce 0%
- ...
- 15/04/10 17:20:25 INFO mapreduce.Job: Job job_1427781540447_0448 completed successfully
- """
- flag={}
- with open(benchlog_fn) as f:
- while True:
- line = f.readline().rstrip()
- if not line: break
- for rule in [_spark_stage_submit, _spark_stage_finish, _hadoop_run_job, _hadoop_map_reduce_progress, _hadoop_job_complete_mr1, _hadoop_job_complete_mr2]:
- matched = rule.match(line)
- if matched:
- result = matched.groups()
- timestamp = int(mktime(datetime.strptime(result[0], r"%y/%m/%d %H:%M:%S").timetuple()) * 1000) # convert to millisec for js
- if rule is _spark_stage_submit:
- events.append("{t},Start {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
- elif rule is _spark_stage_finish:
- events.append("{t},Finish {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
- elif rule is _hadoop_run_job:
- events.append("{t},Start Job {v1}".format(t=timestamp, v1=result[1]))
- flag={}
- elif rule is _hadoop_map_reduce_progress:
- map_progress,reduce_progress = int(result[1]), int(result[2])
- op={'map':False, 'reduce':False}
- if map_progress == 100:
- if not "map" in flag:
- op['map'] = True
- flag['map'] = True
- elif reduce_progress>0:
- if not 'reduce' in flag:
- op['reduce'] = True
- flag['reduce'] = True
- if op['map'] and op['reduce']:
- events.append("{t},Map finish and Reduce start".format(t=timestamp))
- elif op['map']:
- events.append("{t},Map finish".format(t=timestamp))
- elif op['reduce']:
- events.append("{t},Reduce start".format(t=timestamp))
- elif rule is _hadoop_job_complete_mr1 or rule is _hadoop_job_complete_mr2:
- events.append("{t},Finsih Job {v1}".format(t=timestamp, v1=result[1]))
- else:
- assert 0, "should never reach here"
- # limit maximum string length of events
- for i in range(len(events)):
- event_time, event_str = re.split(',', events[i], 1)
- if len(event_str) > 45:
- event_str = event_str[:21]+ '...' + event_str[-21:]
- events[i]="%s,%s" % (event_time, event_str)
- # merge events that occurred at the same time:
- i = 1
- while i < len(events)-1:
- cur = events[i].split(',')[0]
- next = events[i+1].split(',')[0]
- if abs(int(cur)/1000 - int(next)/1000) < 1:
- events[i] = events[i] + "<br>" + re.split(',', events[i+1], 1)[1]
- del events[i+1]
- continue
- i += 1
- return events
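- # generate_report(): reads the monitor log back, groups the samples into PROBE_INTERVAL buckets, builds the
- # CSV series (cluster-wide overall lines plus per-host heatmaps for CPU, network, disk, memory and load) and
- # substitutes them, together with the parsed events, into chart-template.html to produce the final report.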
- def generate_report(workload_title, log_fn, benchlog_fn, report_fn):
- c = -1
- with open(log_fn) as f:
- datas=[eval(x) for x in f.readlines()]
- all_hosts = sorted(list(set([x['hostname'] for x in datas])))
- data_slices = groupby(datas, lambda x:round_to_base(x['timestamp'], PROBE_INTERVAL)) # round to time interval and groupby
- # Generating CSVs
- cpu_heatmap = ["x,y,value,hostname,coreid"]
- cpu_overall = ["x,idle,user,system,iowait,others"]
- network_heatmap = ["x,y,value,hostname,adapterid"]
- network_overall = ["x,recv_bytes,send_bytes,|recv_packets,send_packets,errors"]
- diskio_heatmap = ["x,y,value,hostname,diskid"]
- diskio_overall = ["x,read_bytes,write_bytes,|read_io,write_io"]
- memory_heatmap = ["x,y,value,hostname"]
- memory_overall = ["x,free,buffer_cache,used"]
- procload_heatmap = ["x,y,value,hostname"]
- procload_overall = ["x,load5,load10,load15,|running,procs"]
- events = parse_bench_log(benchlog_fn)
- cpu_count={}
- network_count={}
- diskio_count={}
- memory_count={}
- proc_count={}
- for t, sub_data in data_slices:
- classed_by_host = dict([(x['hostname'], x) for x in sub_data])
- # total cpus, plot user/sys/iowait/other
- data_by_all_hosts = [classed_by_host.get(h, {}) for h in all_hosts]
- # all cpu cores, total cluster
- summed1 = [x['cpu/total'] for x in data_by_all_hosts if x.has_key('cpu/total')]
- if summed1:
- summed = reduce_patched(lambda a,b: a._add(b), summed1) / len(summed1)
- for x in data_by_all_hosts:
- cpu = x.get('cpu/total', None)
- if not cpu: continue
- # user, system, io, idle, others
- # print t, x['hostname'], cpu.user, cpu.system, cpu.iowait, cpu.idle, cpu.nice+cpu.irq+cpu.softirq
- # print t, summed
- cpu_overall.append("{time},{idle},{user},{system},{iowait},{others}" \
- .format(time = int(t*1000), user = summed.user, system = summed.system,
- iowait = summed.iowait, idle = summed.idle,
- others = summed.nice + summed.irq + summed.softirq))
- # all cpu cores, plot heatmap according to cpus/time/usage(100%-idle)
- for idx, x in enumerate(data_by_all_hosts):
- for idy, y in enumerate(filter_dict_with_prefixes(x, "cpu", "!cpu/total").values()):
- try:
- pos = cpu_count[(idx, idy, x['hostname'])]
- except:
- pos = len(cpu_count)
- cpu_count[(idx, idy, x['hostname'])] = pos
- # print t, pos, 100-y.idle, x['hostname'], y.label
- cpu_heatmap.append("{time},{pos},{value},{host},{cpuid}" \
- .format(time = int(t*1000), pos = pos, value = 100-y.idle,
- host = x['hostname'], cpuid = y.label))
- # all disk of each node, total cluster
- summed1=[x['disk/total'] for x in data_by_all_hosts if x.has_key('disk/total')]
- if summed1:
- summed = reduce_patched(lambda a,b: a._add(b), summed1)
- for x in data_by_all_hosts:
- disk = x.get('disk/total', None)
- if not disk: continue
- # io-read, io-write, bytes-read, bytes-write
- # print t, x['hostname'], disk.io_read, disk.io_write, disk.bytes_read, disk.bytes_write
- # print t, summed
- diskio_overall.append("{time},{bytes_read},{bytes_write},{io_read},{io_write}" \
- .format(time = int(t*1000),
- bytes_read = summed.bytes_read / PROBE_INTERVAL,
- bytes_write = summed.bytes_write / PROBE_INTERVAL,
- io_read = summed.io_read / PROBE_INTERVAL,
- io_write = summed.io_write / PROBE_INTERVAL))
- # all disks, plot heatmap according to disks/bytes_read+bytes_write
- for idx, x in enumerate(data_by_all_hosts):
- for idy, y in enumerate(filter_dict_with_prefixes(x, "disk", "!disk/total").values()):
- try:
- pos = diskio_count[(idx, idy, x['hostname'])]
- except:
- pos = len(diskio_count)
- diskio_count[(idx, idy, x['hostname'])] = pos
- # print t, pos, 100-y.idle, x['hostname'], y.label
- diskio_heatmap.append("{time},{pos},{value},{host},{diskid}" \
- .format(time = int(t*1000),
- pos = pos,
- value = (y.bytes_read + y.bytes_write) / PROBE_INTERVAL,
- host = x['hostname'],
- diskid = y.label))
- # memory of each node, total cluster
- summed1 = [x['memory/total'] for x in data_by_all_hosts if x.has_key('memory/total')]
- if summed1:
- summed = reduce_patched(lambda a,b: a._add(b), summed1)
- for x in data_by_all_hosts:
- mem = x.get("memory/total", None)
- if not mem: continue
- # mem-total, mem-used, mem-buffer&cache, mem-free, KB
- # print t, x['hostname'], mem.total, mem.used, mem.buffer_cache, mem.free
- #print t, summed
- memory_overall.append("{time},{free},{buffer_cache},{used}" \
- .format(time = int(t*1000),
- free = summed.free,
- used = summed.used,
- buffer_cache = summed.buffer_cache))
- # all memory, plot heatmap according to memory/total - free
- for idx, x in enumerate(data_by_all_hosts):
- for idy, y in enumerate(filter_dict_with_prefixes(x, "memory/total").values()):
- try:
- pos = memory_count[(idx, idy, x['hostname'])]
- except:
- pos = len(memory_count)
- memory_count[(idx, idy, x['hostname'])] = pos
- # print t, pos, 100-y.idle, x['hostname'], y.label
- memory_heatmap.append("{time},{pos},{value},{host}" \
- .format(time = int(t*1000),
- pos = pos,
- value = (y.total - y.free)*1000,
- host = x['hostname']))
- # proc of each node, total cluster
- summed1 = [x['proc'] for x in data_by_all_hosts if x.has_key('proc')]
- if summed1:
- summed = reduce_patched(lambda a,b: a._add(b), summed1)
- for x in data_by_all_hosts:
- procs = x.get("proc", None)
- if not procs: continue
- procload_overall.append("{time},{load5},{load10},{load15},{running},{procs}"\
- .format(time = int(t*1000),
- load5 = summed.load5,load10=summed.load10,
- load15 = summed.load15,running=summed.running,
- procs = summed.procs))
- # all nodes' proc, plot heatmap according to proc/proc.procs
- for idx, x in enumerate(data_by_all_hosts):
- for idy, y in enumerate(filter_dict_with_prefixes(x, "proc").values()):
- try:
- pos = proc_count[(idx, idy, x['hostname'])]
- except:
- pos = len(proc_count)
- proc_count[(idx, idy, x['hostname'])] = pos
- # print t, pos, 100-y.idle, x['hostname'], y.label
- procload_heatmap.append("{time},{pos},{value},{host}" \
- .format(time = int(t*1000), pos = pos, value = y.procs,
- host = x['hostname']))
- # all network interface, total cluster
- summed1 = [x['net/total'] for x in data_by_all_hosts if x.has_key('net/total')]
- if summed1:
- summed = reduce_patched(lambda a,b: a._add(b), summed1)
- for x in data_by_all_hosts:
- net = x.get("net/total", None)
- if not net: continue
- # recv-byte, send-byte, recv-packet, send-packet, errors
- # print t, x['hostname'], net.recv_bytes, net.send_bytes, net.recv_packets, net.send_packets, net.recv_errs+net.send_errs+net.recv_drop+net.send_drop
- # print t, summed
- network_overall.append("{time},{recv_bytes},{send_bytes},{recv_packets},{send_packets},{errors}" \
- .format(time = int(t*1000),
- recv_bytes = summed.recv_bytes / PROBE_INTERVAL,
- send_bytes = summed.send_bytes / PROBE_INTERVAL,
- recv_packets = summed.recv_packets / PROBE_INTERVAL,
- send_packets = summed.send_packets / PROBE_INTERVAL,
- errors = (summed.recv_errs + summed.send_errs + \
- summed.recv_drop + summed.send_drop) / PROBE_INTERVAL)
- )
- # all network adapters, plot heatmap according to net/recv_bytes + send_bytes
- for idx, x in enumerate(data_by_all_hosts):
- for idy, y in enumerate(filter_dict_with_prefixes(x, "net", "!net/total").values()):
- try:
- pos = network_count[(idx, idy, x['hostname'])]
- except:
- pos = len(network_count)
- network_count[(idx, idy, x['hostname'])] = pos
- network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
- .format(time = int(t*1000),
- pos = pos*2,
- value = y.recv_bytes / PROBE_INTERVAL,
- host = x['hostname'],
- networkid = y.label+".recv"))
- network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
- .format(time = int(t*1000),
- pos = pos*2+1,
- value = y.send_bytes / PROBE_INTERVAL,
- host = x['hostname'],
- networkid = y.label+".send"))
- with open(samedir("chart-template.html")) as f:
- template = f.read()
- variables = locals()
- def my_replace(match):
- match = match.group()[1:-1]
- if match.endswith('heatmap') or match.endswith('overall'):
- return "\n".join(variables[match])
- elif match =='events':
- return "\n".join(events)
- elif match == 'probe_interval':
- return str(PROBE_INTERVAL * 1000)
- elif match == 'workload_name':
- return workload_title
- else:
- return '{%s}' % match
- with open(report_fn, 'w') as f:
- f.write(re.sub(r'{\w+}', my_replace, template))
- def show_usage():
- log("""Usage:
- monitor.py <workload_title> <parent_pid> <log_path.log> <benchlog_fn.log> <report_path.html> <monitor_node_name1> ... <monitor_node_nameN>
- """)
- if __name__=="__main__":
- if len(sys.argv)<6:
- log(sys.argv)
- show_usage()
- sys.exit(1)
- # log(sys.argv)
- global log_path
- global report_path
- global workload_title
- global bench_log_path
- global na
- workload_title = sys.argv[1]
- parent_pid = sys.argv[2]
- log_path = sys.argv[3]
- bench_log_path = sys.argv[4]
- report_path = sys.argv[5]
- nodes_to_monitor = sys.argv[6:]
- pid=os.fork()
- if pid: #parent
- print pid
- else: #child
- os.close(0)
- os.close(1)
- os.close(2)
- # log("child process start")
- signal.signal(signal.SIGTERM, sig_term_handler)
- start_monitor(log_path, nodes_to_monitor)
- while os.path.exists("/proc/%s" % parent_pid):
- sleep(1)
- # parent lost, stop!
- signal.signal(signal.SIGTERM, signal.SIG_IGN)
- na.stop()
- generate_report(workload_title, log_path, bench_log_path, report_path)
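Because generate_report() only needs the monitor log, the benchmark log and chart-template.html (located next to monitor.py via samedir()), the HTML page can be re-rendered offline without re-running the workload. A minimal sketch, run with python2 from the directory containing monitor.py; the workload title and file names below are placeholders:
- import monitor
- # PROBE_INTERVAL defaults to 5.0 s; it must match the interval the log was recorded with,
- # because generate_report() groups samples into PROBE_INTERVAL-sized buckets.
- monitor.PROBE_INTERVAL = 5.0
- monitor.generate_report("wordcount",          # workload title shown in the page header
-                         "bench.log.monitor",  # log written by start_monitor()
-                         "bench.log",          # the workload's own log, parsed for stage/job events
-                         "report.html")        # output HTML report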