
  1. #!/usr/bin/env python2
  2. # Licensed to the Apache Software Foundation (ASF) under one or more
  3. # contributor license agreements. See the NOTICE file distributed with
  4. # this work for additional information regarding copyright ownership.
  5. # The ASF licenses this file to You under the Apache License, Version 2.0
  6. # (the "License"); you may not use this file except in compliance with
  7. # the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  17. import threading, subprocess, re, os, sys, signal, socket
  18. from time import sleep, time
  19. from contextlib import closing
  20. import traceback, thread
  21. from datetime import datetime
  22. from collections import namedtuple
  23. from pprint import pprint
  24. from itertools import groupby
  26. # Probe intervals, in seconds.
  27. # Warning: a value too short may get wrong results due to lack of data when system load goes high.
  28. # and must be float!
  29. PROBE_INTERVAL=float(5)
  31. #FIXME: use log helper later
  32. #log_lock = threading.Lock()
  33. def log(*s):
  34. if len(s)==1: s=s[0]
  35. else: s= " ".join([str(x) for x in s])
  36. # with log_lock:
  37. # with open("/home/zhihui/monitor_proc.log", 'a') as f:
  38. log_str = str(thread.get_ident())+":"+str(s) +'\n'
  39. # f.write( log_str )
  40. sys.stderr.write(log_str)
  42. entered=False
  43. def sig_term_handler(signo, stack):
  44. global entered
  45. global log_path
  46. global report_path
  47. global workload_title
  48. global bench_log_path
  49. global na
  51. if not entered:
  52. entered=True # FIXME: Not atomic
  53. else: return
  55. na.stop()
  56. generate_report(workload_title, log_path, bench_log_path, report_path)
  57. sys.exit(0)
  59. def samedir(fn):
  60. """
  61. return abspath of fn in the same directory where this python file stores
  62. """
  63. return os.path.abspath(os.path.join(os.path.dirname(__file__), fn))
  65. class PatchedNameTuple(object):
  66. def __sub__(self, other):
  67. assert isinstance(other, self.__class__)
  68. assert self[0] == other[0]
  69. cls = self.__class__
  70. return cls(self[0], *[a-b for a, b in zip(self[1:], other[1:])])
  72. def __div__(self, other):
  73. return self.__class__(self[0], *[a/other for a in self[1:]])
  75. def _add(self, other, override_title=None):
  76. if other == None: return self
  77. assert isinstance(other, self.__class__)
  78. cls = self.__class__
  79. title = self[0] if not override_title else override_title
  80. return cls(title, *[a+b for a, b in zip(self[1:], other[1:])])
  82. def ident(size, s):
  83. return "\n".join((" "*size + x for x in s.split("\n")))
  85. class RemoteProc(threading.Thread):
  86. SEP="----SEP----"
  87. template_debug=r"""exec('
  88. import time, os, sys, socket, traceback
  89. socket.setdefaulttimeout(1)
  90. def log(*x, **kw):
  91. with open("/home/zhihui/probe.log", kw.get("mode","a")) as f:
  92. f.write(repr(x)+chr(10))
  93. try:
  94. log("create socket", mode="w")
  95. s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  96. log("bind socket")
  97. s.bind(("",0))
  98. log("listen socket")
  99. s.listen(5)
  100. log("bind socket to:", s.getsockname())
  101. while True:
  102. log("accepting")
  103. try:
  104. print s.getsockname()[1]
  105. s2,peer=s.accept()
  106. break
  107. except socket.timeout:
  108. log("accept timeout, retry")
  109. log("accepted, peer:",peer)
  110. except Exception as e:
  111. import traceback
  112. log(traceback.format_exc())
  113. {func_template}
  114. while True:
  115. s2.send(("{SEP}+%s" % time.time())+chr(10))
  116. {call_template}
  117. s2.send("{SEP}#end"+chr(10))
  118. time.sleep({interval})
  119. ')"""
  120. template=r"""exec('
  121. import time, os, sys, socket, traceback
  122. s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  123. s.bind(("",0))
  124. s.listen(5)
  125. print s.getsockname()[1]
  126. s2,peer=s.accept()
  127. {func_template}
  128. while True:
  129. s2.send(("{SEP}+%s" % time.time())+chr(10))
  130. {call_template}
  131. s2.send("{SEP}#end"+chr(10))
  132. time.sleep({interval})
  133. ')"""
  135. def __init__(self, host, interval=1):
  136. = host
  137. self.cmds = []
  138. self.interval = interval
  139. self.monitor_ins = {}
  140. self.local_aggr_container={}
  141. self._running=True
  143. super(RemoteProc, self).__init__()
  145. def register(self, monitor_ins, cmds):
  146. assert isinstance(monitor_ins, BaseMonitor)
  147. self.monitor_ins[len(self.cmds)] = monitor_ins # monitor command seq id => monitor instance
  148. self.cmds.append(cmds)
  150. def run(self):
  151. func_template = "\n".join(["def func_{id}():\n{func}"\
  152. .format(id=id,
  153. func=ident(2,
  154. func+'\ns2.send("{SEP}={id}"+chr(10))'\
  155. .format(SEP=self.SEP, id=id))) \
  156. for id, func in enumerate(self.cmds)])
  157. call_template="\n".join([" func_{id}()"\
  158. .format(id=id) for id in range(len(self.cmds))]
  159. )
  160. script = self.template.format(func_template=func_template,
  161. call_template=call_template,
  162. interval = self.interval,
  163. SEP = self.SEP)
  165. s = script.replace('"', r'\"').replace("\n", r"\n")
  166. container=[]
  167. # log("ssh client to:",
  168. with self.ssh_client(, "python -u -c \"{script}\"".format(script=s)) as f:
  169. # log("ssh client %s connected" %
  170. try:
  171. port_line = f.readline()
  172. # log("host:",, "got port,", port_line)
  173. port = int(port_line.rstrip())
  174. s=socket.socket(socket.AF_INET, socket.SOCK_STREAM)
  175. s.settimeout(0.5)
  176. for i in range(30): # try to connect 30 times maximum
  177. try:
  178. # log("try to connect:",, port)
  179. s.connect((, port))
  180. # log("connectted to:",, port)
  181. break
  182. except socket.timeout:
  183. # log("connecting to:",, port, "timedout")
  184. pass
  185. else: # not connectted after 30 times trying
  186. # log("cann't connectted to:",, port)
  187. s.shutdown(socket.SHUT_RDWR)
  188. self.ssh_close()
  189. return
  190. s.settimeout(None)
  191. except Exception as e:
  192. log(traceback.format_exc())
  194. with closing(s.makefile()) as f2:
  195. while self._running:
  196. try:
  197. l = f2.readline()
  198. except KeyboardInterrupt:
  199. break
  200. if not l: break
  201. if l.startswith(self.SEP):
  202. tail = l.lstrip(self.SEP)
  203. if tail[0]=='+': # timestamp
  204. remote_timestamp = float(tail[1:])
  205. cur_timestamp = time()
  206. elif tail.startswith('#end'): # end sign
  207. # log("na push, timestamp:", cur_timestamp)
  208. self.na_push(cur_timestamp)
  209. else:
  210. id = int(tail[1:])
  211. if self.monitor_ins[id]:
  212. self.monitor_ins[id].feed(container, cur_timestamp)
  213. container = []
  214. else:
  215. container.append(l.rstrip())
  216. s.shutdown(socket.SHUT_RDWR)
  217. self.ssh_close()
  219. def stop(self):
  220. self._running=False
  222. def aggregate(self, timestamp, data):
  223. if not self.local_aggr_container:
  224. self.local_aggr_container['timestamp']=timestamp
  225. assert timestamp == self.local_aggr_container['timestamp']
  226. assert type(data) is dict
  227. self.local_aggr_container.update(data)
  228. self.local_aggr_container['timestamp'] = timestamp
  230. def na_register(self, na):
  231. assert isinstance(na, NodeAggregator)
  232. self.node_aggr_parent = na
  234. def na_push(self, timestamp):
  235. if self.local_aggr_container:
  236. assert self.local_aggr_container.get('timestamp', -1) == timestamp
  237. self.node_aggr_parent.commit_aggregate(, self.local_aggr_container)
  238. self.local_aggr_container={}
  240. class BaseMonitor(object):
  241. IGNORE_KEYS=[]
  242. def __init__(self, rproc):
  243. self.rproc = rproc
  244. self._last = None
  246. def feed(self, container, timestamp): # override to parse pulled data files
  247. raise NotImplementedError()
  249. def ssh_client(self, host, shell): # override for opening ssh client
  250. raise NotImplementedError()
  252. def ssh_close(self): # override for clear up ssh client
  253. raise NotImplementedError()
  255. def commit(self, timestamp, header, stat):
  256. if self._last is None: self._last = stat
  257. else:
  258. stat_delta = dict([(header+'/'+k, stat[k] - self._last[k]) \
  259. for k in set(self._last.keys()).union(set(stat.keys()))\
  260. if k in stat and k in self._last and k not in self.IGNORE_KEYS
  261. ])
  262. self._last = stat
  263. # if header.startswith("net"):
  264. # print stat_delta
  265. stat_delta[header+'/total'] = reduce_patched(lambda a,b: a._add(b, 'total'), stat_delta.values())
  266. self.rproc.aggregate(timestamp, stat_delta)
  268. class BashSSHClientMixin(object):
  269. ssh_lock = threading.Lock()
  270. def ssh_client(self, host, shell):
  271. with open(os.devnull, 'rb', 0) as DEVNULL:
  272. with BashSSHClientMixin.ssh_lock:
  273. self.proc = subprocess.Popen(["ssh", host, shell], bufsize=1,
  274. stdin=DEVNULL, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
  275. return self.proc.stdout
  277. def ssh_close(self):
  278. assert self.proc
  279. self.proc.terminate()
  280. self.proc.wait()
  281. return self.proc.returncode
  283. _CPU=namedtuple("CPU", ['label', 'user', 'nice', 'system', 'idle', 'iowait', 'irq', 'softirq'])
  284. class CPU(_CPU, PatchedNameTuple):
  285. def percentage(self):
  286. total = sum(self[1:])
  287. return CPU(self[0], *[x*100.0 / total for x in self[1:]]) if total>0 else self
  289. class CPUMonitor(BaseMonitor):
  290. def __init__(self, rproc):
  291. super(CPUMonitor, self).__init__(rproc)
  292. rproc.register(self, """with open("/proc/stat") as f:
  293. s2.send("".join([x for x in f.readlines() if x.startswith("cpu")]))
  294. """)
  296. def feed(self, container, timestamp):
  297. "parse /proc/stat"
  298. self.commit(timestamp, dict([self._parse_stat(line) for line in container]))
  300. def _parse_stat(self, line):
  301. "parse one line of /proc/stat"
  302. assert line.strip(), "BUG! empty line in /proc/stat"
  303. fields = line.split()
  304. if fields[0]=='cpu':
  305. fields[0]='total'
  306. return (fields[0], CPU(fields[0], *[int(x) for x in fields[1:8]]))
  308. def commit(self, timestamp, cpu_stat):
  309. if self._last is None:
  310. self._last = cpu_stat
  311. else:
  312. cpu_usage = dict([("cpu/"+k, (cpu_stat[k] - self._last[k]).percentage()) for k in self._last])
  313. self._last = cpu_stat
  314. self.rproc.aggregate(timestamp, cpu_usage)
  316. _Network=namedtuple("Network", ['label', "recv_bytes", "recv_packets", "recv_errs", "recv_drop",
  317. "send_bytes", "send_packets", "send_errs", "send_drop"])
  318. class Network(_Network, PatchedNameTuple): pass
  320. class NetworkMonitor(BaseMonitor):
  321. IGNORE_KEYS=["lo"]
  322. def __init__(self, rproc):
  323. rproc.register(self, """with open("/proc/net/dev") as f:
  324. s2.send("".join([x for x in f.readlines()]))
  325. """)
  326. self._filter = re.compile('^\s*(.+):\s*(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+).*$')
  327. super(NetworkMonitor, self).__init__(rproc)
  329. def feed(self, container, timestamp):
  330. "parse /proc/net/dev"
  331. self.commit(timestamp, "net", dict(filter(lambda x:x, [self._parse_net_dev(line) for line in container])))
  333. def _parse_net_dev(self, line):
  334. matched = self._filter.match(line)
  335. if matched:
  336. obj = Network(matched.groups()[0], *[int(x) for x in matched.groups()[1:]])
  337. if not (obj.recv_bytes==0 and obj.send_bytes==0):
  338. return (obj[0], obj)
  340. _Disk=namedtuple("Disk", ["label", "io_read", "bytes_read", "time_spent_read", "io_write", "bytes_write", "time_spent_write"])
  342. class Disk(_Disk, PatchedNameTuple): pass
  344. class DiskMonitor(BaseMonitor):
  345. def __init__(self, rproc):
  346. super(DiskMonitor, self).__init__(rproc)
  347. rproc.register(self, """with open("/proc/diskstats") as f:
  348. blocks = os.listdir("/sys/block")
  349. s2.send("".join([x for x in f.readlines() if x.split()[2] in blocks and not x.split()[2].startswith("loop") and x.split()[3]!="0"]))
  350. """)
  352. def feed(self, container, timestamp):
  353. "parse /proc/diskstats"
  354. self.commit(timestamp, "disk", dict([self._parse_disk_stat(line) for line in container]))
  356. def _parse_disk_stat(self, line):
  357. fields = line.split()[2:]
  358. obj = Disk(fields[0],
  359. io_read=int(fields[1]), bytes_read=int(fields[3])*512, time_spent_read=int(fields[4])/1000.0,
  360. io_write=int(fields[5]), bytes_write=int(fields[7])*512, time_spent_write=int(fields[8])/1000.0)
  361. return (obj[0], obj)
  363. _Memory=namedtuple("Memory", ["label", "total", "used", "buffer_cache", "free", "map"])
  364. class Memory(_Memory, PatchedNameTuple): pass
  366. class MemoryMonitor(BaseMonitor):
  367. def __init__(self, rproc):
  368. super(MemoryMonitor, self).__init__(rproc)
  369. rproc.register(self, """with open("/proc/meminfo") as f:
  370. mem = dict([(a, b.split()[0].strip()) for a, b in [x.split(":") for x in f.readlines()]])
  371. s2.send(":".join([mem[field] for field in ["MemTotal", "Buffers", "Cached", "MemFree", "Mapped"]])+chr(10))
  372. """)
  374. def feed(self, memory_status, timestamp):
  375. "parse /proc/meminfo"
  376. total, buffers, cached, free, mapped= [int(x) for x in memory_status[0].split(":")]
  378. self.rproc.aggregate(timestamp, {"memory/total":Memory(label="total", total=total,
  379. used=total - free - buffers-cached,
  380. buffer_cache=buffers + cached,
  381. free=free, map=mapped)})
  382. _Proc=namedtuple("Proc", ["label", "load5", "load10", "load15", "running", "procs"])
  383. class Proc(_Proc, PatchedNameTuple): pass
  385. class ProcMonitor(BaseMonitor):
  386. def __init__(self, rproc):
  387. super(ProcMonitor, self).__init__(rproc)
  388. rproc.register(self, """with open("/proc/loadavg") as f:
  389. s2.send(
  390. """)
  392. def feed(self, load_status, timestamp):
  393. "parse /proc/meminfo"
  394. load5, load10, load15, running_procs= load_status[0].split()[:4]
  395. running, procs = running_procs.split('/')
  397. self.rproc.aggregate(timestamp, {"proc":Proc(label="total", load5=float(load5), load10=float(load10),
  398. load15=float(load15), running=int(running), procs=int(procs))})
  400. class NodeAggregator(object):
  401. def __init__(self, log_name):
  402. self.node_pool = {}
  403. self.log_name = log_name
  404. self.log_lock = threading.Lock()
  405. try:
  406. os.unlink(self.log_name)
  407. except OSError:
  408. pass
  410. def append(self, node):
  411. assert isinstance(node, RemoteProc)
  412. self.node_pool[] = node
  413. node.na_register(self)
  415. def commit_aggregate(self, node, datas):
  416. datas['hostname'] = node
  417. with self.log_lock:
  418. with file(self.log_name, "a") as f:
  419. f.write(repr(datas) + "\n")
  421. def run(self):
  422. for v in self.node_pool.values():
  423. v.start()
  425. def stop(self):
  426. for v in self.node_pool.values():
  427. v.stop()
  428. for v in self.node_pool.values():
  429. v.join()
  431. def round_to_base(v, b):
  432. """
  433. >>> round_to_base(0.1, 0.3)
  434. 0.0
  435. >>> round_to_base(0.3, 0.3)
  436. 0.3
  437. >>> round_to_base(0.0, 0.3)
  438. 0.0
  439. >>> round_to_base(0.5, 0.3)
  440. 0.3
  441. >>> round_to_base(0.51, 0.3)
  442. 0.3
  443. """
  444. for i in range(10):
  445. base = int(b * 10**i)
  446. if abs(base - b * 10**i) < 0.001: break
  447. assert base>0
  448. return float(int(v * 10**i) / base * base) / (10**i)
  450. def filter_dict_with_prefix(d, prefix, sort=True):
  451. keys = sorted(d.keys()) if sort else d.keys()
  452. if prefix[0]=='!':
  453. return dict([(x, d[x]) for x in keys if not x.startswith(prefix[1:])])
  454. else:
  455. return dict([(x, d[x]) for x in keys if x.startswith(prefix)])
  457. def reduce_patched(func, data):
  458. if len(data)==1:
  459. return data[0]
  460. elif len(data)==0:
  461. return data
  462. else:
  463. return reduce(func, data)
  465. def filter_dict_with_prefixes(d, *prefixes):
  466. if len(prefixes)==1:
  467. return filter_dict_with_prefix(d, prefixes[0])
  468. else:
  469. return reduce_patched(lambda a,b: filter_dict_with_prefix(filter_dict_with_prefix(d, a),b),
  470. prefixes)
  472. def test():
  473. p = BashSSHClientMixin()
  474. script=r"""exec('
  475. import time, os, sys
  476. while 1:
  477. with open("/proc/stat") as f: print,
  478. print "---hello---"
  479. time.sleep(1)
  480. ')"""
  481. s = script.replace('"', r'\"').replace("\n", r"\n")
  482. with p.ssh_client("localhost", "python -u -c \"{s}\"".format(s=s)) as f:
  483. while 1:
  484. l = f.readline()
  485. print l.rstrip()
  486. if not l: break
  487. p.ssh_close()
  489. def test2():
  490. class P(RemoteProc, BashSSHClientMixin): pass
  492. p = P("localhost", 0.3)
  493. CPUMonitor(p)
  494. NetworkMonitor(p)
  495. DiskMonitor(p)
  496. MemoryMonitor(p)
  500. def start_monitor(log_filename, nodes):
  501. class P(RemoteProc, BashSSHClientMixin):
  502. def __init__(self, *args):
  503. RemoteProc.__init__(self, *args)
  504. CPUMonitor(self)
  505. NetworkMonitor(self)
  506. DiskMonitor(self)
  507. MemoryMonitor(self)
  508. ProcMonitor(self)
  509. global na
  510. na = NodeAggregator(log_filename)
  511. nodes = sorted(list(set(nodes)))
  512. for node in nodes:
  513. na.append(P(node, PROBE_INTERVAL))
  516. def parse_bench_log(benchlog_fn):
  517. events=["x,event"]
  518. _spark_stage_submit = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: Submitting (Stage \d+) \((.*)\).+$") # submit spark stage
  519. _spark_stage_finish = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO [a-zA-Z0-9_\.]*DAGScheduler: (Stage \d+) \((.*)\) finished.+$") # spark stage finish
  520. _hadoop_run_job = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*: Running job: job_([\d_]+)$") # hadoop run job
  521. _hadoop_map_reduce_progress = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.*\.Job.*:\s+map (\d{1,2})% reduce (\d{1,2})%$") # hadoop reduce progress
  522. _hadoop_job_complete_mr1 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapred.JobClient: Job complete: job_([\d_]+)$")
  523. _hadoop_job_complete_mr2 = re.compile("^(\d{2}\/\d{2}\/\d{2} \d{2}:\d{2}:\d{2}) INFO mapreduce.Job: Job job_([\d_]+) completed successfully$")
  525. """
  526. # MR1 sample
  527. 14/06/24 11:18:39 INFO mapred.JobClient: Running job: job_201406241116_0001
  528. 14/06/24 11:18:40 INFO mapred.JobClient: map 0% reduce 0%
  529. ...
  530. 13/11/21 14:38:55 INFO mapred.JobClient: Job complete: job_201311150128_0050
  532. # MR2 sample
  533. 15/04/10 17:20:01 INFO mapreduce.Job: Running job: job_1427781540447_0448
  534. 15/04/10 17:20:07 INFO mapreduce.Job: Job job_1427781540447_0448 running in uber mode : false
  535. 15/04/10 17:20:07 INFO mapreduce.Job: map 0% reduce 0%
  536. ...
  537. 15/04/10 17:20:25 INFO mapreduce.Job: Job job_1427781540447_0448 completed successfully
  538. """
  539. flag={}
  540. with open(benchlog_fn) as f:
  541. while True:
  542. line = f.readline().rstrip()
  543. if not line: break
  544. for rule in [_spark_stage_submit, _spark_stage_finish, _hadoop_run_job, _hadoop_map_reduce_progress, _hadoop_job_complete_mr1, _hadoop_job_complete_mr2]:
  545. matched = rule.match(line)
  546. if matched:
  547. result = matched.groups()
  548. timestamp = datetime.strptime(result[0], r" # convert to millsec for js
  549. if rule is _spark_stage_submit:
  550. events.append("{t},Start {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
  551. elif rule is _spark_stage_finish:
  552. events.append("{t},Finish {v1} ({v2})".format(t=timestamp, v1=result[1], v2=result[2]))
  553. elif rule is _hadoop_run_job:
  554. events.append("{t},Start Job {v1}".format(t=timestamp, v1=result[1]))
  555. flag={}
  556. elif rule is _hadoop_map_reduce_progress:
  557. map_progress,reduce_progress = int(result[1]), int(result[2])
  558. op={'map':False, 'reduce':False}
  559. if map_progress == 100:
  560. if not "map" in flag:
  561. op['map'] = True
  562. flag['map'] = True
  563. elif reduce_progress>0:
  564. if not 'reduce' in flag:
  565. op['reduce'] = True
  566. flag['reduce'] = True
  567. if op['map'] and op['reduce']:
  568. events.append("{t},Map finish and Reduce start".format(t=timestamp))
  569. elif op['map']:
  570. events.append("{t},Map finish".format(t=timestamp))
  571. elif op['reduce']:
  572. events.append("{t},Reduce start".format(t=timestamp))
  573. elif rule is _hadoop_job_complete_mr1 or rule is _hadoop_job_complete_mr2:
  574. events.append("{t},Finsih Job {v1}".format(t=timestamp, v1=result[1]))
  575. else:
  576. assert 0, "should never reach here"
  578. # limit maximum string length of events
  579. for i in range(len(events)):
  580. event_time, event_str = re.split(',', events[i], 1)
  581. if len(event_str) > 45:
  582. event_str = event_str[:21]+ '...' + event_str[-21:]
  583. events[i]="%s,%s" % (event_time, event_str)
  585. # merge events occurred at sametime:
  586. i = 1
  587. while i < len(events)-1:
  588. cur = events[i].split(',')[0]
  589. next = events[i+1].split(',')[0]
  590. if abs(int(cur)/1000 - int(next)/1000) < 1:
  591. events[i] = events[i] + "<br>" + re.split(',', events[i+1], 1)[1]
  592. del events[i+1]
  593. continue
  594. i += 1
  595. return events
  597. def generate_report(workload_title, log_fn, benchlog_fn, report_fn):
  598. c =- 1
  599. with open(log_fn) as f:
  600. datas=[eval(x) for x in f.readlines()]
  602. all_hosts = sorted(list(set([x['hostname'] for x in datas])))
  603. data_slices = groupby(datas, lambda x:round_to_base(x['timestamp'], PROBE_INTERVAL)) # round to time interval and groupby
  605. # Generating CSVs
  606. cpu_heatmap = ["x,y,value,hostname,coreid"]
  607. cpu_overall = ["x,idle,user,system,iowait,others"]
  608. network_heatmap = ["x,y,value,hostname,adapterid"]
  609. network_overall = ["x,recv_bytes,send_bytes,|recv_packets,send_packets,errors"]
  610. diskio_heatmap = ["x,y,value,hostname,diskid"]
  611. diskio_overall = ["x,read_bytes,write_bytes,|read_io,write_io"]
  612. memory_heatmap = ["x,y,value,hostname"]
  613. memory_overall = ["x,free,buffer_cache,used"]
  614. procload_heatmap = ["x,y,value,hostname"]
  615. procload_overall = ["x,load5,load10,load15,|running,procs"]
  616. events = parse_bench_log(benchlog_fn)
  618. cpu_count={}
  619. network_count={}
  620. diskio_count={}
  621. memory_count={}
  622. proc_count={}
  624. for t, sub_data in data_slices:
  625. classed_by_host = dict([(x['hostname'], x) for x in sub_data])
  626. # total cpus, plot user/sys/iowait/other
  627. data_by_all_hosts = [classed_by_host.get(h, {}) for h in all_hosts]
  629. # all cpu cores, total cluster
  630. summed1 = [x['cpu/total'] for x in data_by_all_hosts if x.has_key('cpu/total')]
  631. if summed1:
  632. summed = reduce_patched(lambda a,b: a._add(b), summed1) / len(summed1)
  633. for x in data_by_all_hosts:
  634. cpu = x.get('cpu/total', None)
  635. if not cpu: continue
  636. # user, system, io, idle, others
  637. # print t, x['hostname'], cpu.user, cpu.system, cpu.iowait, cpu.idle, cpu.nice+cpu.irq+cpu.softirq
  638. # print t, summed
  639. cpu_overall.append("{time},{idle},{user},{system},{iowait},{others}" \
  640. .format(time = int(t*1000), user = summed.user, system = summed.system,
  641. iowait = summed.iowait, idle = summed.idle,
  642. others = summed.nice + summed.irq + summed.softirq))
  644. # all cpu cores, plot heatmap according to cpus/time/usage(100%-idle)
  645. for idx, x in enumerate(data_by_all_hosts):
  646. for idy, y in enumerate(filter_dict_with_prefixes(x, "cpu", "!cpu/total").values()):
  647. try:
  648. pos = cpu_count[(idx, idy, x['hostname'])]
  649. except:
  650. pos = len(cpu_count)
  651. cpu_count[(idx, idy, x['hostname'])] = pos
  652. # print t, pos, 100-y.idle, x['hostname'], y.label
  653. cpu_heatmap.append("{time},{pos},{value},{host},{cpuid}" \
  654. .format(time = int(t*1000), pos = pos, value = 100-y.idle,
  655. host = x['hostname'], cpuid = y.label))
  657. # all disk of each node, total cluster
  658. summed1=[x['disk/total'] for x in data_by_all_hosts if x.has_key('disk/total')]
  659. if summed1:
  660. summed = reduce_patched(lambda a,b: a._add(b), summed1)
  661. for x in data_by_all_hosts:
  662. disk = x.get('disk/total', None)
  663. if not disk: continue
  664. # io-read, io-write, bytes-read, bytes-write
  665. # print t, x['hostname'], disk.io_read, disk.io_write, disk.bytes_read, disk.bytes_write
  666. # print t, summed
  667. diskio_overall.append("{time},{bytes_read},{bytes_write},{io_read},{io_write}" \
  668. .format(time = int(t*1000),
  669. bytes_read = summed.bytes_read / PROBE_INTERVAL,
  670. bytes_write = summed.bytes_write / PROBE_INTERVAL,
  671. io_read = summed.io_read / PROBE_INTERVAL,
  672. io_write = summed.io_write / PROBE_INTERVAL))
  674. # all disks, plot heatmap according to disks/bytes_read+bytes_write
  675. for idx, x in enumerate(data_by_all_hosts):
  676. for idy, y in enumerate(filter_dict_with_prefixes(x, "disk", "!disk/total").values()):
  677. try:
  678. pos = diskio_count[(idx, idy, x['hostname'])]
  679. except:
  680. pos = len(diskio_count)
  681. diskio_count[(idx, idy, x['hostname'])] = pos
  682. # print t, pos, 100-y.idle, x['hostname'], y.label
  683. diskio_heatmap.append("{time},{pos},{value},{host},{diskid}" \
  684. .format(time = int(t*1000),
  685. pos = pos,
  686. value = (y.bytes_read + y.bytes_write) / PROBE_INTERVAL,
  687. host = x['hostname'],
  688. diskid = y.label))
  690. # memory of each node, total cluster
  691. summed1 = [x['memory/total'] for x in data_by_all_hosts if x.has_key('memory/total')]
  692. if summed1:
  693. summed = reduce_patched(lambda a,b: a._add(b), summed1)
  694. for x in data_by_all_hosts:
  695. mem = x.get("memory/total", None)
  696. if not mem: continue
  697. # mem-total, mem-used, mem-buffer&cache, mem-free, KB
  698. # print t, x['hostname'],, mem.used, mem.buffer_cache,
  699. #print t, summed
  700. memory_overall.append("{time},{free},{buffer_cache},{used}" \
  701. .format(time = int(t*1000),
  702. free =,
  703. used = summed.used,
  704. buffer_cache = summed.buffer_cache))
  706. # all memory, plot heatmap according to memory/total - free
  707. for idx, x in enumerate(data_by_all_hosts):
  708. for idy, y in enumerate(filter_dict_with_prefixes(x, "memory/total").values()):
  709. try:
  710. pos = memory_count[(idx, idy, x['hostname'])]
  711. except:
  712. pos = len(memory_count)
  713. memory_count[(idx, idy, x['hostname'])] = pos
  714. # print t, pos, 100-y.idle, x['hostname'], y.label
  715. memory_heatmap.append("{time},{pos},{value},{host}" \
  716. .format(time = int(t*1000),
  717. pos = pos,
  718. value = ( -*1000,
  719. host = x['hostname']))
  721. # proc of each node, total cluster
  722. summed1 = [x['proc'] for x in data_by_all_hosts if x.has_key('proc')]
  723. if summed1:
  724. summed = reduce_patched(lambda a,b: a._add(b), summed1)
  725. for x in data_by_all_hosts:
  726. procs = x.get("proc", None)
  727. if not procs: continue
  728. procload_overall.append("{time},{load5},{load10},{load15},{running},{procs}"\
  729. .format(time = int(t*1000),
  730. load5 = summed.load5,load10=summed.load10,
  731. load15 = summed.load15,running=summed.running,
  732. procs = summed.procs))
  734. # all nodes' proc, plot heatmap according to proc/proc.procs
  735. for idx, x in enumerate(data_by_all_hosts):
  736. for idy, y in enumerate(filter_dict_with_prefixes(x, "proc").values()):
  737. try:
  738. pos = proc_count[(idx, idy, x['hostname'])]
  739. except:
  740. pos = len(proc_count)
  741. proc_count[(idx, idy, x['hostname'])] = pos
  742. # print t, pos, 100-y.idle, x['hostname'], y.label
  743. procload_heatmap.append("{time},{pos},{value},{host}" \
  744. .format(time = int(t*1000), pos = pos, value = y.procs,
  745. host = x['hostname']))
  747. # all network interface, total cluster
  748. summed1 = [x['net/total'] for x in data_by_all_hosts if x.has_key('net/total')]
  750. if summed1:
  751. summed = reduce_patched(lambda a,b: a._add(b), summed1)
  752. for x in data_by_all_hosts:
  753. net = x.get("net/total", None)
  754. if not net: continue
  755. # recv-byte, send-byte, recv-packet, send-packet, errors
  756. # print t, x['hostname'], net.recv_bytes, net.send_bytes, net.recv_packets, net.send_packets, net.recv_errs+net.send_errs+net.recv_drop+net.send_drop
  757. # print t, summed
  758. network_overall.append("{time},{recv_bytes},{send_bytes},{recv_packets},{send_packets},{errors}" \
  759. .format(time = int(t*1000),
  760. recv_bytes = summed.recv_bytes / PROBE_INTERVAL,
  761. send_bytes = summed.send_bytes / PROBE_INTERVAL,
  762. recv_packets = summed.recv_packets / PROBE_INTERVAL,
  763. send_packets = summed.send_packets / PROBE_INTERVAL,
  764. errors = (summed.recv_errs + summed.send_errs + \
  765. summed.recv_drop + summed.send_drop) / PROBE_INTERVAL)
  766. )
  768. # all network adapters, plot heatmap according to net/recv_bytes + send_bytes
  769. for idx, x in enumerate(data_by_all_hosts):
  770. for idy, y in enumerate(filter_dict_with_prefixes(x, "net", "!net/total").values()):
  771. try:
  772. pos = network_count[(idx, idy, x['hostname'])]
  773. except:
  774. pos = len(network_count)
  775. network_count[(idx, idy, x['hostname'])] = pos
  776. network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
  777. .format(time = int(t*1000),
  778. pos = pos*2,
  779. value = y.recv_bytes / PROBE_INTERVAL,
  780. host = x['hostname'],
  781. networkid = y.label+".recv"))
  782. network_heatmap.append("{time},{pos},{value},{host},{networkid}" \
  783. .format(time = int(t*1000),
  784. pos = pos*2+1,
  785. value = y.send_bytes / PROBE_INTERVAL,
  786. host = x['hostname'],
  787. networkid = y.label+".send"))
  789. with open(samedir("chart-template.html")) as f:
  790. template =
  792. variables = locals()
  793. def my_replace(match):
  794. match =[1:-1]
  795. if match.endswith('heatmap') or match.endswith('overall'):
  796. return "\n".join(variables[match])
  797. elif match =='events':
  798. return "\n".join(events)
  799. elif match == 'probe_interval':
  800. return str(PROBE_INTERVAL * 1000)
  801. elif match == 'workload_name':
  802. return workload_title
  803. else:
  804. return '{%s}' % match
  806. with open(report_fn, 'w') as f:
  807. f.write(re.sub(r'{\w+}', my_replace, template))
  809. def show_usage():
  810. log("""Usage:
  811. <workload_title> <parent_pid> <log_path.log> <benchlog_fn.log> <report_path.html> <monitor_node_name1> ... <monitor_node_nameN>
  812. """)
  814. if __name__=="__main__":
  815. if len(sys.argv)<6:
  816. log(sys.argv)
  817. show_usage()
  818. sys.exit(1)
  820. # log(sys.argv)
  821. global log_path
  822. global report_path
  823. global workload_title
  824. global bench_log_path
  825. global na
  827. workload_title = sys.argv[1]
  828. parent_pid = sys.argv[2]
  829. log_path = sys.argv[3]
  830. bench_log_path = sys.argv[4]
  831. report_path = sys.argv[5]
  832. nodes_to_monitor = sys.argv[6:]
  833. pid=os.fork()
  834. if pid: #parent
  835. print pid
  836. else: #child
  837. os.close(0)
  838. os.close(1)
  839. os.close(2)
  840. # log("child process start")
  841. signal.signal(signal.SIGTERM, sig_term_handler)
  842. start_monitor(log_path, nodes_to_monitor)
  843. while os.path.exists("/proc/%s" % parent_pid):
  844. sleep(1)
  845. # parent lost, stop!
  846. signal.signal(signal.SIGTERM, signal.SIG_IGN)
  847. na.stop()
  848. generate_report(workload_title, log_path, bench_log_path, report_path)

